def checkSearchStatus(mc,msgSize,msgs,custId,userId,sleepSecs):
    """Poll the search index until the expected number of messages is visible.

    mc        -- ManagementContainer instance
    msgSize   -- number of documents the search is expected to return
    msgs      -- message objects whose storage ids constrain the search,
                 or None to search without a storage-id constraint
    custId    -- customer id for the IndexSearchConstraint
    userId    -- user id for the IndexSearchConstraint
    sleepSecs -- seconds to sleep between polls

    Returns True once the doc count equals msgSize, False when the 20
    retries are exhausted, or None when msgSize does not match len(msgs).
    NOTE(review): the None return is falsy like False, but inconsistent
    with the boolean contract — callers should treat any falsy value as
    failure.
    """
    if msgs is not None:
        # Collect the storage ids we expect to find in the index.
        msgIds = []
        for msg in msgs:
            msgIds.append(Long(msg.getMessageId()))
        if msgSize != len(msgIds):
            # Caller supplied an inconsistent expected count; bail out early.
            print "msgSize != msg array size"
            return None
    isc = IndexSearchConstraint(custId,userId)
    if msgs is not None:
        isc.constrainByStorageIds(msgIds)
    sm = mc.getIndexSearchManager()
    retries = 20
    done = False
    sr = None
    # Poll with an empty query (constraint-only search) until the count matches.
    while not done and retries > 0:
        sleep(sleepSecs)
        retries = retries - 1
        sr = sm.search('',isc,None,CallerApp.TESTING)
        done = (sr is not None) and (sr.getDocCount() == msgSize)
        if sr is not None:
            print 'Search found messages: ' + str(sr.getDocCount())
    if not done:
        # Report the last observed count to aid debugging.
        lastCount = 0
        if sr is not None:
            lastCount = sr.getDocCount()
        print 'Search failed to find messages',lastCount
    return done
def checkSearchStatus3(mc,msgs,custid,shouldExist,query):
    """Verify that a query does (or does not) match every given message.

    The message storage ids are processed in chunks of 200; for each chunk
    the query is retried for up to 180 * 5 seconds until the doc count
    equals the chunk size (shouldExist True) or zero (shouldExist False).

    mc          -- ManagementContainer instance
    msgs        -- message objects to check
    custid      -- customer id for the IndexSearchConstraint
    shouldExist -- True to require all messages match, False to require none
    query       -- query string passed to the index search manager

    Returns True when every chunk reached the expected count, False as soon
    as one chunk times out.
    """
    print 'checkSearchStatus(',query,')'
    done = False
    msgIds = []
    count = 0
    for msg in msgs:
        print 'Message Id No',count,' ', msg.getMessageId()
        msgIds.append(Long(msg.getMessageId()))
        count = count + 1
    found = 0
    # Carve off the first chunk of at most 200 storage ids.
    cs = 0
    ce = cs + 200
    if ce > len(msgIds):
        ce = len(msgIds)
    chunk = msgIds[cs:ce]
    cs = ce
    sr = None
    while len(chunk) > 0:
        isc = IndexSearchConstraint(custid,None)
        isc.constrainByStorageIds(chunk)
        sm = mc.getIndexSearchManager()
        retries = 180
        done = False
        # Poll this chunk until the expected doc count is observed.
        while not done and retries > 0:
            sr = sm.search(query,isc,None,CallerApp.TESTING)
            # print 'checkSearchStatus() found',sr.getDocCount()
            if shouldExist:
                done = (sr is not None) and (sr.getDocCount() == len(chunk))
            else:
                done = (sr is not None) and (sr.getDocCount() == 0)
            if not done:
                time.sleep(5)
                retries = retries - 1
            # Progress report roughly once a minute (12 polls * 5 s).
            if not done and retries % 12 == 0:
                print 'checkSearchStatus() Chunk',sr.getDocCount(),'of',len(chunk),'Total',found,'of',count
        if sr is not None:
            found = found + sr.getDocCount()
        print 'checkSearchStatus() found',found,'of',count
        if not done:
            # One chunk failed; give up without trying the remaining chunks.
            return done
        # next chunk
        ce = cs + 200
        if ce > len(msgIds):
            ce = len(msgIds)
        chunk = msgIds[cs:ce]
        cs = ce
    return done
def performSampleAudit(loc,samplePercent,seed): global BATCH_SAMPLE_PCT mc = ManagementContainer.getInstance() cm = mc.getClusterManager() im = mc.getIslandManager() cluster = cm.getCluster(loc.getClusterId()) island = im.getIsland(cluster.getIslandId()) masterURL = URL(loc.getClusterLocationProperty(SolrClusterAdapter.SOLR_MASTER_HOST_URL_PROP)) slaveURL = URL(loc.getClusterLocationProperty(SolrClusterAdapter.SOLR_SLAVE_HOST_URL_PROP)) ssmMaster = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml') ssmMaster.setURL(masterURL) ssmSlave = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml') ssmSlave.setURL(slaveURL) sm = mc.getIndexSearchManager() # note that this will be slow -- especially if samplePercent is large and data set size is large srcIS = IndexSearchConstraint(None,None) srcIS.constrainByNumberOfHitsToReturn(1) srcIS.constrainByIsland(island) destIS = IndexSearchConstraint(None,None) destIS.constrainByNumberOfHitsToReturn(1) destIS.constrainByIsland(island) # get a message count for the customer srcQR = search(MainMessageQuery(),srcIS,ssmSlave) srcDocCount = srcQR.getDocCount() print 'total messages',srcDocCount,'msgs' msgsToSample = int(srcDocCount * samplePercent) if msgsToSample < 100: chunkSize = 10 elif msgsToSample < 5000: chunkSize = 100 else: chunkSize = 1000 print 'Source sample size',msgsToSample,'msgs' # ensure 10% on selections per sample if samplePercent < 0.10: samplePercent = 0.10 # if corpus is large, sample more per chunk if chunkSize >= 1000 and msgsToSample > 50000: samplePercent = 0.5 # perform sample audit sampleCountsFromSource(ssmSlave,ssmMaster,srcIS,destIS,samplePercent,msgsToSample,seed,chunkSize)
def performQuickAudit(location):
    """Print a quick side-by-side count audit of a location's Solr slave
    (source) versus master (destination) index.

    Compares four counts: all documents, main messages, attachments, and
    documents without an isattachment field ("test data").
    """
    mc = ManagementContainer.getInstance()
    clus = mc.getClusterManager().getCluster(location.getClusterId())
    isle = mc.getIslandManager().getIsland(clus.getIslandId())
    masterURL = URL(location.getClusterLocationProperty(SolrClusterAdapter.SOLR_MASTER_HOST_URL_PROP))
    slaveURL = URL(location.getClusterLocationProperty(SolrClusterAdapter.SOLR_SLAVE_HOST_URL_PROP))
    ssmMaster = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml')
    ssmMaster.setURL(masterURL)
    ssmSlave = MySolrSearchManager(mc.getConfiguration(),'solrconfig.xml')
    ssmSlave.setURL(slaveURL)
    # Only counts are needed, so cap hits at 1 on both sides.
    srcIS = IndexSearchConstraint(None,None)
    srcIS.constrainByIsland(isle)
    srcIS.constrainByNumberOfHitsToReturn(1)
    destIS = IndexSearchConstraint(None,None)
    destIS.constrainByIsland(isle)
    destIS.constrainByNumberOfHitsToReturn(1)
    # Total document counts.
    srcQR = search('',srcIS,ssmSlave)
    destQR = search('',destIS,ssmMaster)
    srcDocCount = srcQR.getDocCount()
    destDocCount = destQR.getDocCount()
    # Main (non-attachment) message counts.
    srcQR = search(MainMessageQuery(),srcIS,ssmSlave)
    destQR = search(MainMessageQuery(),destIS,ssmMaster)
    srcMessages = srcQR.getDocCount()
    destMessages = destQR.getDocCount()
    # Attachment counts.
    srcQR = search(AttachmentQuery(),srcIS,ssmSlave)
    destQR = search(AttachmentQuery(),destIS,ssmMaster)
    srcAttachments = srcQR.getDocCount()
    destAttachments = destQR.getDocCount()
    # Documents lacking an isattachment field entirely.
    srcQR = search('-isattachment:*',srcIS,ssmSlave)
    destQR = search('-isattachment:*',destIS,ssmMaster)
    srcNoAttachments = srcQR.getDocCount()
    destNoAttachments = destQR.getDocCount()
    print 'SOURCE',slaveURL,'DEST',masterURL
    print 'SOURCE Documents:',srcDocCount,'DEST Documents:',destDocCount
    print 'SOURCE Messages:',srcMessages,'DEST Messages:',destMessages
    print 'SOURCE Attachments:',srcAttachments,'DEST Attachments:',destAttachments
    print 'SOURCE (test data):',srcNoAttachments,'DEST (test data):',destNoAttachments
def getDocuments(ism, isle, storageId, locationId):
    """Return all indexed documents for one storage id on one shard.

    ism        -- index search manager to query
    isle       -- island used to constrain the search
    storageId  -- storage id of the message to fetch
    locationId -- shard (location) to restrict the search to
    """
    constraint = IndexSearchConstraint(None,None)
    constraint.constrainByIsland(isle)
    constraint.constrainByStorageIds([Long(storageId)])
    constraint.setShardConstraint(locationId)
    constraint.setOutputFields(['*'])
    hits = ism.search('',constraint, None, CallerApp.INTERNAL)
    # Materialize the result iterator into a plain list.
    return [doc for doc in hits]
def searchCount(mc,custid,keywords,sender,receiver,subject,attachment,language,attach_scope,rg = None):
    """Build a FAST query from the given criteria, run it, and return the
    number of matching documents.

    mc           -- ManagementContainer instance
    custid       -- customer id used for both the query builder and constraint
    keywords     -- free-text search terms
    sender       -- from-address filter
    receiver     -- recipient filter
    subject      -- subject filter
    attachment   -- attachment file-name filter
    language     -- language applied to the query builder
    attach_scope -- attachment scope applied to the query builder
    rg           -- optional reviewer group constraint
    """
    manager = mc.getIndexSearchManager()
    constraint = IndexSearchConstraint(custid,None)
    if rg is not None:
        constraint.constrainByReviewerGroup(rg)
    builder = FastQueryBuilder(custid)
    builder.applyLanguage(language)
    builder.applyAttachmentScope(attach_scope)
    builder.applyDefaultSearch(keywords,False,False)
    builder.applyFileName(attachment,None)
    builder.applyRecipients(receiver,False)
    builder.applyFrom(sender)
    builder.applySubject(subject)
    result = manager.search(builder.getQuery(),constraint,None,CallerApp.TESTING)
    return result.getDocCount()
def migrate(custIds,islandId,sourceIslandId,minRange,maxRange,showAndExitOnly=False):
    """Walk the source island's index in storageid order and enqueue each
    distinct document onto QUEUE for migration.

    custIds        -- customer ids used to build the query
    islandId       -- destination island id (not read in this block;
                      presumably consumed by the QUEUE workers — verify)
    sourceIslandId -- island whose index is searched
    minRange       -- minimum storage id to start from
    maxRange       -- maximum storage id, baked into the query
    showAndExitOnly-- when True, print the initial count and return

    Side effects: mutates globals StartCount and TotSearchTime, and puts
    documents on the global QUEUE.
    """
    global QUEUE,MsgsProcessed,StartCount,TotSearchTime
    mc = ManagementContainer.getInstance()
    ism = mc.getIndexSearchManager()
    im = mc.getIslandManager()
    sourceIsland = im.getIsland(int(sourceIslandId))
    sourceType = sourceIsland.getIndexPlatformVersion()
    # search for documents; query syntax depends on the source index platform
    isc = IndexSearchConstraint(None,None)
    isc.sortBy('storageid',True)
    isc.constrainByMinimumStorageId(minRange)
    isc.constrainByIsland(mc.getIslandManager().getIsland(sourceIslandId))
    if IndexPlatformVersion.SOLR_x == sourceType :
        query = buildSolrQuery(custIds, maxRange)
    else:
        query = buildFastQuery(custIds, maxRange)
    start = time.time()
    sr = search(query,isc,ism)
    StartCount = sr.getDocCount()
    elapsed = time.time() - start
    TotSearchTime = TotSearchTime + elapsed
    log('PERF: search',StartCount,'msgs, time',elapsed,'s','total',TotSearchTime,'s')
    if showAndExitOnly:
        return
    # Page through results; when a page is exhausted, reissue the search
    # starting just beyond the last storage id seen.
    hitsRemaining = ism.getMaxIndextHits()
    lastStorageId = 0
    iter = sr.documents().iterator()
    while checkSearchIter(iter):
        doc = iter.next()
        hitsRemaining -= 1
        storageId = doc.getStorageID()
        # only process each EMS message ID one time
        if storageId != lastStorageId:
            QUEUE.put(doc)
            lastStorageId = storageId
        if hitsRemaining <= 0 or not checkSearchIter(iter):
            log('STAT: getting new search results beyond storageId',storageId)
            isc.constrainByMinimumStorageId(storageId)
            start = time.time()
            # NOTE(review): follow-up pages append ' and isattachment:0' while
            # the initial search does not — confirm this asymmetry is intended.
            sr = search(query + ' and isattachment:0',isc,ism)
            elapsed = time.time() - start
            TotSearchTime = TotSearchTime + elapsed
            log('PERF: search time',elapsed,'s','total',TotSearchTime,'s')
            hitsRemaining = ism.getMaxIndextHits()
            iter = sr.documents().iterator()
    log('STAT: search work complete')
def analyze(islandId,timeBoundary,commit,type):
    """Find duplicate documents on an island and delete (or report) them.

    islandId     -- island to scan
    timeBoundary -- lower bound for the processingtime range in the query
    commit       -- True to validate against the DB and actually delete;
                    False for a dry-run driven by validateData
    type         -- document type constraint, also passed to deleteDuplicate
    """
    mc = ManagementContainer.getInstance()
    im = mc.getIslandManager()
    isle = im.getIsland(islandId)
    fIS = IndexSearchConstraint(None,None)
    fIS.constrainByIsland(isle)
    fIS.constrainByType(type)
    ism = mc.getIndexSearchManager()
    qb = MyCriteria("isattachment:0 AND processingtime:[%s TO *]" % timeBoundary)
    qr = ism.scaledSearch(qb,fIS,CallerApp.INTERNAL)
    # Drain the result iterator so the duplicate map is fully populated.
    for doc in qr:
        pass
    dupsById = qr.getDuplicates()
    print time.asctime(),'Found',dupsById.size(),type,'duplicates in island',islandId,'total docs',qr.getDocCount()
    # dupsById maps storage id -> location ids holding copies of that document.
    for me in dupsById.entrySet():
        storageId = me.getKey()
        locIds = me.getValue()
        data = {}
        for locId in locIds:
            docs = getDocuments(ism,isle,storageId,locId)
            # Key each location's documents by their content id.
            byId = {}
            for doc in docs:
                byId[doc.getString(IIndexSearchSchema.FIELD_CONTENT_ID)] = doc
            data[locId] = byId
        try:
            dupData = []
            if commit is True:
                # Commit mode: cross-check against the DB before deleting.
                (found,dupData) = simpleValidateData(data)
                if not found:
                    print time.asctime(),'The DB has no matching record for any of the documents found in the archive. Skipping',storageId
                    continue
            else:
                # Dry-run mode: report what would be deleted.
                dupData = validateData(data)
            # NOTE(review): this loop rebinds storageId from the outer loop.
            for storageId,locId in dupData:
                print time.asctime(),'Deleting documents for',storageId,'on location',locId
                deleteDuplicate(locId,storageId,commit,type)
        except Exception,e:
            print time.asctime(),'Not recommending any change for storage ID',storageId,'due to validation value',e
def test(islandId,custName,numMessages): mc = ManagementContainer.getInstance() custid = findCustomer(custName) if custid < 0: print 'test failed because customer',custName,'was not found' return 1 island = mc.getIslandManager().getIsland(islandId) try: msgs = findMessages(mc,custid,numMessages) if msgs.size() < numMessages: print 'Fail, Did not find all messages stored, only found', msgs.size() return 1 if not checkSearchStatus(mc,msgs,custid): print 'Fail, could not find all messages in search index' return 1 ism = mc.getIndexSearchManager() isc = IndexSearchConstraint(custid,None) isc.constrainByNumberOfHitsToReturn(1) isc.constrainByIsland(island) isc.constrainByOffset(0) try: results = ism.resolveLocations(isc) print 'Fail, expected IndexSearchException' return 1 except IndexSearchException, e: print 'Pass, IndexSearchException',e except: print 'Expected IndexSearchException, but got',sys.exc_info(),traceback.print_exc(file=sys.stderr) return 1 isc.constrainByNumberOfHitsToReturn(2) isc.constrainByOffset(0) try: results = ism.resolveLocations(isc) sz = getResultSize(results) print 'Pass, hits=2',sz if sz < numMessages: print 'Wrong number of results. Expected >=',numMessages,'Got',sz except: print 'Unexpected exception caught when hits was set to 2',sys.exc_info(),traceback.print_exc(file=sys.stderr) return 1 isc.constrainByNumberOfHitsToReturn(numMessages) isc.constrainByOffset(0) try: results = ism.resolveLocations(isc) sz = getResultSize(results) print 'Pass, hits=',numMessages,sz if sz < numMessages: print 'Wrong number of results. Expected >=',numMessages,'Got',sz,results.getResults() except: print 'Unexpected exception caught when hits was set to',numMessages,sys.exc_info(),traceback.print_exc(file=sys.stderr) return 1
def search(isle, term):
    """Run one page of a sorted, island-constrained search and print a
    preview of the matching (partitionId, storageId) pairs.

    isle -- island to constrain the search to
    term -- query string
    """
    mc = ManagementContainer.getInstance()
    sm = mc.getIndexSearchManager()
    im = mc.getIslandManager()
    pm = mc.getPartitionManager()
    fIS = IndexSearchConstraint(None, None)
    fIS.constrainByIsland(isle)
    fIS.constrainByNumberOfHitsToReturn(10)
    fIS.constrainByMinimumStorageId(0)
    fIS.setOutputFields(["storageid", "partitionid"])
    fIS.sortBy("storageid", True)
    done = False
    cnt = 0
    lastID = 0
    while not done:
        # Retry the search indefinitely on any Java Throwable.
        ok = False
        while not ok:
            try:
                fQR = sm.search(term, fIS, None, CallerApp.INTERNAL)
                ok = True
            except Throwable, t:
                print "Exception caught during search, retry = true", t
                t.printStackTrace()
        numDocs = fQR.getDocCount()
        print "Found numDocs", numDocs
        done = fQR.getDocCount() == 0
        print "Preview some data"
        for doc in fQR.documents():
            # print doc.getPartitionID(),pm.getPartition(doc.getPartitionID()).isReadOnly(),doc.getStorageID(),doc.getReceivedDate()
            print doc.getPartitionID(), doc.getStorageID()
            lastID = doc.getStorageID()
            cnt += 1
        fIS.constrainByMinimumStorageId(lastID)
        # just loop once
        # NOTE(review): this forces the outer loop to exit after one page,
        # so the pagination above (lastID advance) is currently dormant.
        done = True
def performSampleAudit(custIds,samplePercent,seed):
    """Run a randomized sample audit per customer, comparing each customer's
    old feed island (source) against its current feed island (destination).

    custIds       -- customer ids to audit
    samplePercent -- fraction of each customer's corpus to sample (floored
                     at 0.10, forced to 0.5 for very large corpora)
    seed          -- random seed forwarded to sampleCountsFromSource
    """
    # NOTE(review): BATCH_SAMPLE_PCT is declared global but never used here.
    global BATCH_SAMPLE_PCT
    mc = ManagementContainer.getInstance()
    cm = mc.getCustomerManager()
    sm = mc.getIndexSearchManager()
    # note that this will be slow -- especially if samplePercent is large and data set size is large
    for custId in custIds:
        cust = cm.getCustomer(custId)
        srcIS = IndexSearchConstraint(None,None)
        srcIS.constrainByIsland(cust.getOldFeedIsland())
        srcIS.constrainByNumberOfHitsToReturn(1)
        srcIS.constrainByCustomerId(custId)
        destIS = IndexSearchConstraint(None,None)
        destIS.constrainByIsland(cust.getFeedIsland())
        destIS.constrainByNumberOfHitsToReturn(1)
        destIS.constrainByCustomerId(custId)
        # get a message count for the customer
        srcQR = searchAndWrap(MainMessageQuery(),srcIS,sm)
        srcDocCount = srcQR.getDocCount()
        print 'Customer',custId,'total messages',srcDocCount,'msgs'
        msgsToSample = int(srcDocCount * samplePercent)
        # Scale chunk size with the sample size.
        if msgsToSample < 100:
            chunkSize = 10
        elif msgsToSample < 5000:
            chunkSize = 100
        else:
            chunkSize = 1000
        print 'Customer',custId,'Source sample size',msgsToSample,'msgs'
        # ensure 10% on selections per sample
        if samplePercent < 0.10:
            samplePercent = 0.10
        # if corpus is large, sample more per chunk
        if chunkSize >= 1000 and msgsToSample > 50000:
            samplePercent = 0.5
        # perform sample audit
        sampleCountsFromSource(sm,srcIS,destIS,samplePercent,msgsToSample,seed,chunkSize)
def performQuickAudit(custIds):
    """Print a quick count audit per customer: old feed island (source)
    versus current feed island (destination).

    Compares total documents, main messages, and attachments.
    """
    mc = ManagementContainer.getInstance()
    cm = mc.getCustomerManager()
    sm = mc.getIndexSearchManager()
    for custId in custIds:
        cust = cm.getCustomer(custId)
        # Only counts are needed, so cap hits at 1 on both sides.
        srcIS = IndexSearchConstraint(None,None)
        srcIS.constrainByIsland(cust.getOldFeedIsland())
        srcIS.constrainByNumberOfHitsToReturn(1)
        srcIS.constrainByCustomerId(custId)
        destIS = IndexSearchConstraint(None,None)
        destIS.constrainByIsland(cust.getFeedIsland())
        destIS.constrainByNumberOfHitsToReturn(1)
        destIS.constrainByCustomerId(custId)
        # NOTE(review): source uses searchAndWrap while dest uses search —
        # presumably because the islands run different index platforms; verify.
        srcQR = searchAndWrap('',srcIS,sm)
        destQR = search('',destIS,sm)
        srcDocCount = srcQR.getDocCount()
        destDocCount = destQR.getDocCount()
        srcQR = searchAndWrap(MainMessageQuery(),srcIS,sm)
        destQR = search(MainMessageQuery(),destIS,sm)
        srcMessages = srcQR.getDocCount()
        destMessages = destQR.getDocCount()
        srcQR = searchAndWrap(AttachmentQuery(),srcIS,sm)
        destQR = search(AttachmentQuery(),destIS,sm)
        srcAttachments = srcQR.getDocCount()
        destAttachments = destQR.getDocCount()
        print 'Customer',custId,'SOURCE Documents:',srcDocCount,'DEST Documents:',destDocCount
        print 'Customer',custId,'SOURCE Messages:',srcMessages,'DEST Messages:',destMessages
        print 'Customer',custId,'SOURCE Attachments:',srcAttachments,'DEST Attachments:',destAttachments
def testArchive(numMessages, numFound, query = ''):
    """End-to-end test of e-discovery archive export.

    Waits for numMessages to be stored and searchable, builds an archive
    from `query`, downloads and unzips it under /tmp/<custId>, then verifies
    the exported message files, key files, and EDRM XML. Exits the process
    with status 0 on success, 1 on failure.

    Relies on module-level names: custname, users, domainName,
    REVIEWER_GROUP_NAME, findMessages, checkSearchStatus, buildArchive.
    """
    basePath = ""
    mc = ManagementContainer.getInstance()
    custList = mc.getCustomerManager().findCustomers([SearchConstraint(ICustomerManager.PROP_NAME, SearchConstraintOperator.CONSTRAINT_EQUALS, custname)])
    customerId = custList[0].getCustID()
    print time.asctime(), "Customer Id:", customerId
    reviewer = mc.getUserManager().findUserForEmail(users[0] + '@' + domainName)
    reviewerId = reviewer.getUserID()
    reviewerGroup = mc.getReviewerGroupManager().getReviewerGroup(customerId, REVIEWER_GROUP_NAME)
    if reviewerGroup is None:
        print time.asctime(), 'reviewer group not found'
        sys.exit(1)
    # wait for all msgs to be stored
    msgs = findMessages(mc, customerId, numMessages, True)
    # wait for all msgs to be indexed and searchable
    print time.asctime(), 'waiting for all messages to be searchable'
    if not checkSearchStatus(mc,msgs,customerId):
        print time.asctime(), 'messages were not searchable in the alotted time'
        sys.exit(1)
    print time.asctime(), 'all messages searchable:', [m.getMessageId() for m in msgs]
    # get list of msg IDs that satisfy query
    foundMsgIDs = []
    ism = mc.getIndexSearchManager()
    isc = IndexSearchConstraint(customerId, None)
    isc.constrainByNumberOfHitsToReturn(2*numMessages)
    qs = UQLSearchCriteria(query, False)
    rs = ism.search(qs, isc, None, CallerApp.RECOVERY_ARCHIVE)
    for m in rs:
        foundMsgIDs.append(m.getStorageID())
    print time.asctime(),'found messages:', foundMsgIDs
    # Creating e-discovery archive
    archive, result, rm = buildArchive(customerId, mc, numFound, reviewerId, reviewerGroup, UQLQuery=query)
    # download export chunks
    if result is True:
        print time.asctime(), 'exporting archive...'
        basePath = "/tmp/" + str(customerId)
        # Start from a clean export directory.
        if os.path.exists(basePath) :
            shutil.rmtree(basePath)
        os.makedirs(basePath)
        out = FileOutputStream(basePath + "/archive.zip")
        rm.createPerUserActiveRecoveryArchiveFile(customerId, archive.getFile().getName(), reviewerId, SimpleOutputStreamWrapper(out), None)
        out.close()
        if os.system("cd " + basePath + "; unzip archive.zip") != 0:
            print time.asctime(), 'failed to unzip ' + basePath + '/archive.zip'
            result = False
    # verify exported messages
    if result is True:
        print time.asctime(), 'verifying exported messages...'
        if os.path.exists(os.path.join(basePath, 'nomessagefound.txt')):
            print time.asctime(), 'Archive was empty. No messages found by RecoveryManager'
            result = False
        else:
            # Each message produces two files (.gz payload and .key).
            archiveMsgs = len(os.listdir(basePath + '/' + str(reviewerId) + '/inbox')) / 2
            if archiveMsgs != numFound:
                print time.asctime(), 'Archive message count incorrect:', numFound, '!=', archiveMsgs
                result = False
            for msgId in foundMsgIDs:
                archiveFile = basePath + '/' + str(reviewerId) + '/inbox/' + str(msgId)
                if not os.path.isfile(archiveFile + '.gz'):
                    print time.asctime(), 'Archive message not present: ' + archiveFile + '.gz'
                    result = False
                if not os.path.isfile(archiveFile + '.key'):
                    print time.asctime(), 'Archive key not present: ' + archiveFile + '.key'
                    result = False
    # verify EDRM
    if result is True:
        print time.asctime(), 'verifying EDRM XML...'
        minStorageId = min(foundMsgIDs)
        maxStorageId = max(foundMsgIDs)
        edrmFile = basePath + '/edrm_v1-2_' + str(minStorageId) + '-' + str(maxStorageId) + '.xml'
        if not os.path.isfile(edrmFile):
            print time.asctime(), 'EDRM XML not present: ' + edrmFile
            result = False
    # TODO - Add XML schema validation
    if result is True:
        print time.asctime(), 'on-premises/cloud message export successful'
    else:
        print time.asctime(), 'on-premises/cloud message export failed'
    # Clean up only on success so failures leave evidence behind.
    if result is True:
        os.system("rm -rf " + basePath )
        sys.exit(0)
    else:
        sys.exit(1)
def test(islandId, numMessages, esHosts):
    """Test storage-id wraparound handling in the index search path.

    Forces the message-id sequence to just below Integer.MAX_VALUE, restarts
    the storageimporter service so new messages straddle the 32-bit boundary,
    then verifies doc counts for minimum-storage-id constraints of 0,
    MAX_VALUE-1, and MAX_VALUE+1. Always restores the sequence and purges the
    test customer/messages in the finally block.

    Returns 0 on success, 1 on any count mismatch.
    NOTE(review): the expected counts 10/9/7 presumably assume
    numMessages == 10 — confirm against the caller.
    """
    mc = ManagementContainer.getInstance()
    lastVal = None
    custId = None
    service = Service("storageimporter")
    msgs = None
    try:
        # Push the id sequence to the wraparound boundary.
        lastVal, nextVal = setMessageId(mc, Integer.MAX_VALUE - 2)
        print "lastVal,nextVal", lastVal, nextVal
        print "restart storageimporters", esHosts
        service.invoke("restart", esHosts)
        custId = setupCustomer(mc, islandId)
        msgs = findMessages(mc, custId, numMessages)
        print "created messages", msgs
        checkSearchStatus(mc, msgs, custId)
        ism = mc.getIndexSearchManager()
        isc = IndexSearchConstraint(None, None)
        isc.sortBy("storageid", True)
        isc.constrainByMinimumStorageId(0)
        isc.constrainByIsland(mc.getIslandManager().getIsland(islandId))
        print "DBG min", 0
        query = "customerid:" + str(custId)
        sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL)
        if sr.getDocCount() != 10:
            print >> sys.stderr, "Failed to find 10 documents when minStorageId = 0, found", sr.getDocCount()
            return 1
        print "DBG min", Integer.MAX_VALUE - 1
        isc.constrainByMinimumStorageId(Integer.MAX_VALUE - 1)
        sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL)
        if sr.getDocCount() != 9:
            print >> sys.stderr, "Failed to find 9 documents when minStorageId = ", Integer.MAX_VALUE - 1, ",found", sr.getDocCount()
            return 1
        print "DBG min", Integer.MAX_VALUE + 1
        isc.constrainByMinimumStorageId(Integer.MAX_VALUE + 1)
        sr = ism.searchAndWrap(query, isc, CallerApp.INTERNAL)
        if sr.getDocCount() != 7:
            print >> sys.stderr, "Failed to find 7 documents when minStorageId = ", Integer.MAX_VALUE + 1, ",found", sr.getDocCount()
            return 1
    finally:
        # Best-effort cleanup: purge messages, delete the customer, restore
        # the id sequence, and restart the importers.
        if msgs is not None:
            for msg in msgs:
                purgeMessage(mc, msg, custId)
        if custId is not None:
            mc.getCustomerManager().deleteCustomers([custId])
        if lastVal is not None:
            lastVal, nextVal = setMessageId(mc, lastVal)
            print "lastVal,nextVal", lastVal, nextVal
        print "restart storageimporters", esHosts
        service.invoke("restart", esHosts)
    return 0