import os

from datetime import datetime

#Project-specific helpers used below (getDictFromFile, dumpJsonToFile, retryParallelTwtsExt,
#normalizeCol, parallelTask, mimicBrowser, getURIHash, getNowTime, clean_html, writeTextToFile,
#extractPageTitleFromHTML, redditSearchExpand, wikipediaGetExternalLinksDictFromPage,
#PrecEval, SegmentCols, ExtractMicroCol, etc.) are assumed to be imported from the
#surrounding project modules.

def genExplThreadsCol(threads, config, cacheFilename, expThreadFlag=True):

    if (len(threads) == 0):
        return {}

    prevNow = datetime.now()
    twts = getDictFromFile(cacheFilename)

    if (len(twts) == 0):
        twts = retryParallelTwtsExt(
            threads,
            maxRetryCount=config['maxRetryCount'],
            tweetConvMaxTweetCount=config['tweetConvMaxTweetCount'],
            maxNoMoreTweetCounter=config['maxNoMoreTweetCounter'],
            chromedriverPath=config['chromedriverPath'],
            extraParams=config)

        if (len(twts) != 0):
            dumpJsonToFile(cacheFilename, twts, indentFlag=False)
    else:
        print('\ngenExplThreadsCol(): read tweets from cache:', cacheFilename)

    delta = datetime.now() - prevNow

    twts = updateCache(threads, config, cacheFilename, expThreadFlag, twts)
    twts = normalizeCol(twts, expThreadFlag)

    return twts
def main(goldFilname, testFilename=None):

    goldFilname = goldFilname.strip()
    if (len(goldFilname) == 0):
        return

    prevNow = datetime.now()
    goldstandard = PrecEval(goldFilname)

    if (testFilename is not None):

        tstFile = getDictFromFile(testFilename)
        PrecEval.getHTMLAndTextForURILst(tstFile, testFilename)

        tstFile['timestamp'] = getNowTime()
        tstFile['predicted-precision'] = PrecEval.prlEvalCol(
            tstFile, goldstandard.goldstandard, removeTxt=False)
        tstFile['sim-coeff'] = goldstandard.simCoeff

        dumpJsonToFile(testFilename, tstFile)

    delta = datetime.now() - prevNow
    print('\tdelta seconds:', delta.seconds)
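#Hedged usage sketch (illustrative, not part of the original module): main() loads a
#gold-standard collection, optionally downloads/loads the text for every URI in a test
#collection, then writes the predicted precision back into the test file. The file
#paths below are hypothetical placeholders.
#
#   main('./GoldStandards/topic.json', testFilename='./Caches/SegmentedCols/topic/twitter-serp.json')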
def updateCache(threads, config, cacheFilename, expThreadFlag, twts):

    print('\nupdateCache')
    if (len(twts) == 0 or len(threads) == 0):
        print('\tempty returning:', len(twts), len(threads))
        return twts

    cacheURIs = {}
    newReqURIs = []

    for twtCol in twts:
        cacheURIs[twtCol['self']] = True

    for uri in threads:
        if (uri not in cacheURIs):
            newReqURIs.append(uri)

    print('\tnew uris:', len(newReqURIs))
    if (len(newReqURIs) == 0):
        return twts

    print('\twill attempt updating cache')
    updateTwts = retryParallelTwtsExt(
        newReqURIs,
        maxRetryCount=config['maxRetryCount'],
        tweetConvMaxTweetCount=config['tweetConvMaxTweetCount'],
        maxNoMoreTweetCounter=config['maxNoMoreTweetCounter'],
        chromedriverPath=config['chromedriverPath'],
        extraParams=config)

    if (len(updateTwts) != 0):
        twts = twts + updateTwts
        dumpJsonToFile(cacheFilename, twts, indentFlag=False)

    return twts
def writeReport(self, indentFlag=False):

    if (self.health):
        dumpJsonToFile(self.reportFilename, self.cols, indentFlag=indentFlag)
def extractCols(self):

    allSegmentedCols = {
        'name': 'all',
        'timestamp': getNowTime(),
        'extraction-timestamp': getNowTime(),
        'segmented-cols': {
            'ss': [],
            'sm': [],
            'ms': [],
            'mm': [],
            'mc': []
        }
    }

    print('\nextractCols() - start')
    for i in range(len(self.cols['sources'])):

        src = self.cols['sources'][i]
        print('\tsrc/active:', src['name'], src['active'])

        src.setdefault('config', {})
        #transfer generic plot config to local src config
        src['config']['generic'] = self.cols['generic']

        if (src['active'] == False and 'id' in src):
            self.cols['sources'][i]['output'] = self.getColFromCache(
                src['name'], src['id'])

        if (src['name'] == 'reddit'):

            if (src['active']):
                self.cols['sources'][i]['output'] = redditSearchExpand(
                    src['query'],
                    maxPages=src['config']['maxPages'],
                    extraParams=src['config'])

                if (src['config']['expandDegree1Comments']):
                    #allow update (saving source in cache) of source when deg 1 links have been explored
                    src['active'] = True

            SegmentCols.genericAddReplyGroup(
                self.cols['sources'][i]['output'], SegmentCols.redditAuthorComp)
            segCols = SegmentCols.redditSegmentCols(
                self.cols['sources'][i]['output'],
                self.cols['collectionTopic'],
                src['id'],
                src['config']['sort'],
                extraParams=src['config'])
            ExtractMicroCol.cpAllSegCols(
                allSegmentedCols['segmented-cols'], segCols, src['name'])

            segOutfilename = './Caches/SegmentedCols/' + self.cols['collectionTopic'] + '/' + src['id'] + '.json'
            dumpJsonToFile(segOutfilename, segCols, indentFlag=True)

        elif (src['name'] == 'wikipedia'):

            if (src['active']):
                self.cols['sources'][i]['output'] = wikipediaGetExternalLinksDictFromPage(
                    src['uri'])

        elif (src['name'] == 'twitter-serp'):

            if (src['active']):
                self.cols['sources'][i]['output'] = self.genTwitterCols(
                    src['name'], src['config'])

            segCols = SegmentCols.twitterSegmentCols(
                self.cols['sources'][i]['output'],
                self.cols['collectionTopic'],
                src['id'],
                src['config']['vertical'],
                extraParams=src['config'])
            ExtractMicroCol.cpAllSegCols(
                allSegmentedCols['segmented-cols'], segCols, src['name'])

        elif (src['name'] == 'twitter-moments'):

            if (src['active']):
                self.cols['sources'][i]['output'] = self.genTwitterMoments(
                    src['name'], src['uris'], src['query'])

            segCols = SegmentCols.twitterMomentsSegmentCols(
                self.cols['sources'][i]['output'],
                self.cols['collectionTopic'],
                src['id'],
                extraParams=src['config'])
            ExtractMicroCol.cpAllSegCols(
                allSegmentedCols['segmented-cols'], segCols, src['name'])

        elif (src['name'] == 'facebook'):
            print('\tFB is off')
            '''
            if( src['active'] ):
                self.cols['sources'][i]['output'] = self.genFacebookCols( src['name'], src['config'] )

            SegmentCols.genericAddReplyGroup( self.cols['sources'][i]['output'], SegmentCols.facebookAuthorComp )
            SegmentCols.facebookSegmentCols(self.cols['sources'][i]['output'], self.cols['collectionTopic'], src['id'], extraParams=src['config'])
            '''

        elif (src['name'] == 'scoopit'):

            if (src['active']):
                self.cols['sources'][i]['output'] = self.genScoopItCols(
                    src['config'])

            segCols = SegmentCols.scoopitSegmentCols(
                self.cols['sources'][i]['output'],
                self.cols['collectionTopic'],
                src['id'],
                extraParams=src['config'])
            ExtractMicroCol.cpAllSegCols(
                allSegmentedCols['segmented-cols'], segCols, src['name'])

        elif (src['name'] == 'sutori'):

            if (src['active']):
                self.cols['sources'][i]['output'] = self.genSutoriCols(
                    src['query'], src['config'])

            segCols = SegmentCols.sutoriSegmentCols(
                self.cols['sources'][i]['output'],
                self.cols['collectionTopic'],
                src['id'],
                extraParams=src['config'])
            ExtractMicroCol.cpAllSegCols(
                allSegmentedCols['segmented-cols'], segCols, src['name'])

        elif (src['name'] == 'all'):
            SegmentCols.genericSegmentCols(
                allSegmentedCols,
                self.cols['collectionTopic'],
                src['id'],
                extraParams=src['config'])

        #save src in cache - start
        if ('output' in self.cols['sources'][i] and src['active']):
            if (len(self.cols['sources'][i]['output']) != 0):
                tmpSrcCacheFilename = './Caches/Sources/' + self.cols['collectionTopic'] + '/' + src['id'] + '.json'
                dumpJsonToFile(tmpSrcCacheFilename, self.cols['sources'][i]['output'])
        #save src in cache - end

        print()

    print('extractCols() - end\n')
    return self.cols
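#Hedged configuration sketch (inferred from the fields read in extractCols(); not an
#authoritative schema): each entry of self.cols['sources'] is expected to carry at least
#the keys accessed above. A minimal, hypothetical reddit source could look like:
#
#   {
#       'name': 'reddit',
#       'id': 'reddit-0',
#       'active': True,
#       'query': 'ebola virus',
#       'config': {'maxPages': 1, 'sort': 'top', 'expandDegree1Comments': False}
#   }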
def getHTMLAndTextForURILst(col, outfilename=None, printSuffix='', extraParams=None):

    if (extraParams is None):
        extraParams = {}

    extraParams.setdefault('simCacheLookup', True)

    jobsLst = []
    statusCodeJobsLst = []
    jobSize = len(col['uris'])

    for i in range(jobSize):

        uri = col['uris'][i]
        if ('hash' not in uri):
            uri['hash'] = getURIHash(uri['uri'])

        if (PrecEval.uriDctHasBasics(uri) and extraParams['simCacheLookup']):
            #ignore already processed files (usually already processed segments),
            #unless cache lookup is off
            continue

        #attempt - cache - start
        cosineSimFile = './Caches/CosineSim/' + col['uris'][i]['hash'] + '.json'
        if (os.path.exists(cosineSimFile) and extraParams['simCacheLookup']):

            cache = getDictFromFile(cosineSimFile)
            if (PrecEval.uriDctHasBasics(cache)):
                uri['text'] = cache['text']
                uri['text-len'] = cache['text-len']
                uri['title'] = cache['title']
                uri['status-code'] = cache['status-code']
                #print('\t\tskipping since cache available')
                continue

        if ('custom' in uri):
            if ('mime' in uri['custom']):
                if (uri['custom']['mime'] != 'text/html'):
                    print('\tskipping', uri['custom']['mime'])
                    uri['text'] = 'NoneHTML'
                    uri['text-len'] = 8
                    uri.setdefault('title', '')
                    uri.setdefault('status-code', -1)
                    continue
        '''
        txtFile = './Caches/Plaintext/' + uri['hash'] + '.txt'
        htmlFile = './Caches/HTML/' + uri['hash'] + '.html'
        if( os.path.exists(txtFile) ):
            uri['text'] = readTextFromFile(txtFile)
            uri['text-len'] = len(uri['text'])
            uri['title'] = extractPageTitleFromHTML( readTextFromFile(htmlFile) )
            continue
        '''
        #attempt - cache - end

        jobsLst.append({
            'func': mimicBrowser,
            'args': {
                'uri': uri['uri'],
                'extraParams': {
                    'sizeRestrict': 4000000
                }
            },
            'misc': {
                'i': i,
                'hash': uri['hash']
            },
            'print': 'gtHTML.URILst->dfURI(): ' + str(i) + ' of ' + str(jobSize) + printSuffix  #+ '\n\tu: ' + uri['uri']
        })

        statusCodeJobsLst.append({
            'func': mimicBrowser,
            'args': {
                'uri': uri['uri'],
                'getRequestFlag': False,
                'extraParams': None
            },
            'misc': {
                'i': i,
                'hash': uri['hash']
            },
            'print': 'gtHTML.URILst->mkHdReq.(): ' + str(i) + ' of ' + str(jobSize) + printSuffix
        })

    resLst = []
    if (len(jobsLst) != 0):
        resLst = parallelTask(jobsLst, threadCount=3)

    for res in resLst:

        html = res['output']
        plaintext = clean_html(html)

        indx = res['misc']['i']
        col['uris'][indx]['text'] = plaintext
        col['uris'][indx]['text-len'] = len(plaintext)
        col['uris'][indx]['title'] = extractPageTitleFromHTML(html)

        writeTextToFile('./Caches/HTML/' + res['misc']['hash'] + '.html', html)
        print('\t\thtmllen:', len(html))

        writeTextToFile('./Caches/Plaintext/' + res['misc']['hash'] + '.txt', plaintext)
        print('\t\tplaintextlen:', len(plaintext))

    resLst = []
    if (len(statusCodeJobsLst) != 0):
        resLst = parallelTask(statusCodeJobsLst, threadCount=3)

    for res in resLst:

        headReq = res['output']
        indx = res['misc']['i']

        cache = {}
        cache['text'] = col['uris'][indx]['text']
        cache['text-len'] = col['uris'][indx]['text-len']
        cache['title'] = col['uris'][indx]['title']

        cache['status-code'] = -1
        col['uris'][indx]['status-code'] = -1
        if ('status-code' in headReq):
            cache['status-code'] = headReq['status-code']
            col['uris'][indx]['status-code'] = headReq['status-code']

        cacheFilename = './Caches/CosineSim/' + res['misc']['hash'] + '.json'
        dumpJsonToFile(cacheFilename, cache)

    col['timestamp'] = getNowTime()

    if (outfilename is not None):
        dumpJsonToFile(outfilename, col)
def prlEvalCol(col, goldstandard, removeTxt=True, extraParams=None):

    if (extraParams is None):
        extraParams = {}

    extraParams.setdefault('minTextSize', 300)
    '''
        Important note:
        1. If minTextSize is changed, or if the gold standard text content is changed,
           set simCacheLookup to False to skip the cache lookup so the similarity is recalculated.
        2. If the gold standard sim-coeff is changed, no need to do anything.
    '''
    extraParams.setdefault('simCacheLookup', True)
    extraParams.setdefault('printSuffix', '')

    colsize = len(col['uris'])
    if (colsize == 0 or len(goldstandard) == 0):
        print('\tprlEvalCol(): colsize is 0 or goldstandard == 0, returning')
        return -1

    if ('uris' not in goldstandard):
        print('\tprlEvalCol(): no uris in goldstandard, returning')
        return -1

    goldRange = list(range(len(goldstandard['uris'])))
    combinedGold = PrecEval.combineDocsForIndices(goldstandard['uris'], goldRange)

    precision = 0
    validColSize = 0
    jobsLst = []

    for i in range(colsize):

        #attempt getting sim from cache - start
        cosineSimFile = './Caches/CosineSim/' + col['uris'][i]['hash'] + '.json'
        if (os.path.exists(cosineSimFile) and extraParams['simCacheLookup']):

            cosSim = getDictFromFile(cosineSimFile)
            if ('sim' in cosSim):

                col['uris'][i]['sim'] = cosSim['sim']
                if (cosSim['sim'] != -1):
                    validColSize += 1

                if (PrecEval.isRel(cosSim['sim'], goldstandard['sim-coeff'])):
                    col['uris'][i]['relevant'] = True
                    precision += 1
                else:
                    col['uris'][i]['relevant'] = False

                continue
        #attempt getting sim from cache - end

        noopFlag = False
        usingSubText = ''
        if (len(col['uris'][i]['text']) < extraParams['minTextSize']):

            if ('post-details' in col['uris'][i]):
                #gold standards do not have post-details
                if ('substitute-text' in col['uris'][i]['post-details']):

                    subText = col['uris'][i]['post-details']['substitute-text'].strip()
                    if (subText != ''):
                        col['uris'][i]['text'] = subText
                        col['uris'][i]['custom']['substitute-text-active'] = True
                        usingSubText = '\n\t\tusing subtext: ' + col['uris'][i]['uri']
                    else:
                        noopFlag = True
            else:
                #don't process uris with small text
                #don't skip (continue) so cache can update
                noopFlag = True

        matrix = [col['uris'][i]['text'], combinedGold]
        keywords = {'matrix': matrix, 'noopFlag': noopFlag}
        toPrint = '\tprlEvalCol():' + str(i) + ' of ' + str(colsize) + ' ' + extraParams['printSuffix'] + usingSubText

        if ('status-code' not in col['uris'][i]):
            print('\tproblem ahead for uri:', col['uris'][i]['uri'])
            print('\tproblem ahead for hash:', col['uris'][i]['hash'])
            print('\tproblem ahead for cosineSimFile:', cosineSimFile)
            print('\tproblem ahead for keys:', col['uris'][i].keys())

        cache = {
            'hash': col['uris'][i]['hash'],
            'self': cosineSimFile,
            'uri': col['uris'][i]['uri'],
            'title': col['uris'][i]['title'],
            'text': col['uris'][i]['text'],
            'text-len': len(col['uris'][i]['text']),
            'status-code': col['uris'][i]['status-code']
        }

        jobsLst.append({
            'func': PrecEval.calcPairSim,
            'args': keywords,
            'misc': {
                'i': i,
                'cache': cache
            },
            'print': toPrint
        })

    resLst = []
    if (len(jobsLst) != 0):
        resLst = parallelTask(jobsLst, threadCount=3)

    for res in resLst:

        indx = res['misc']['i']
        cache = res['misc']['cache']
        sim = res['output']

        col['uris'][indx]['sim'] = sim
        if (sim != -1):
            validColSize += 1

        if (PrecEval.isRel(sim, goldstandard['sim-coeff'])):
            col['uris'][indx]['relevant'] = True
            precision += 1
        else:
            col['uris'][indx]['relevant'] = False

        #write cache - start
        cache['sim'] = sim
        dumpJsonToFile(cache['self'], cache)
        #write cache - end

    if (removeTxt):
        for i in range(colsize):
            if ('text' in col['uris'][i]):
                del col['uris'][i]['text']

    if (validColSize > 0):
        return precision / validColSize
    else:
        return -1
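#Hedged worked example (illustrative): if a collection has 10 URIs, 8 of which yield a
#valid similarity (sim != -1), and 6 of those meet the gold standard's 'sim-coeff'
#threshold (PrecEval.isRel() returns True), prlEvalCol() returns 6 / 8 = 0.75 as the
#predicted precision.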
def updateGoldstandard(self):

    self.goldstandard['timestamp'] = getNowTime()
    dumpJsonToFile(self.goldstandardFilename, self.goldstandard)