def genExplThreadsCol(threads, config, cacheFilename, expThreadFlag=True):

    if (len(threads) == 0):
        return {}

    prevNow = datetime.now()
    twts = getDictFromFile(cacheFilename)

    if (len(twts) == 0):
        twts = retryParallelTwtsExt(
            threads,
            maxRetryCount=config['maxRetryCount'],
            tweetConvMaxTweetCount=config['tweetConvMaxTweetCount'],
            maxNoMoreTweetCounter=config['maxNoMoreTweetCounter'],
            chromedriverPath=config['chromedriverPath'],
            extraParams=config)

        if (len(twts) != 0):
            dumpJsonToFile(cacheFilename, twts, indentFlag=False)
    else:
        print('\ngenExplThreadsCol(): read tweets from cache:', cacheFilename)

    delta = datetime.now() - prevNow
    twts = updateCache(threads, config, cacheFilename, expThreadFlag, twts)
    twts = normalizeCol(twts, expThreadFlag)

    return twts
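# A hypothetical usage sketch of genExplThreadsCol(). The config keys are the
# ones the function reads above; the thread URI, values, and topic path are
# made up for illustration:
def _exampleGenExplThreadsCol():

    config = {
        'maxRetryCount': 2,                 #passed to retryParallelTwtsExt()
        'tweetConvMaxTweetCount': 100,      #passed to retryParallelTwtsExt()
        'maxNoMoreTweetCounter': 4,         #passed to retryParallelTwtsExt()
        'chromedriverPath': '/usr/local/bin/chromedriver'
    }

    #threads would normally come from a SERP's 'explicit-thread-links'
    threads = ['https://twitter.com/example/status/1234567890']

    return genExplThreadsCol(
        threads, config,
        './Caches/ExpTwttrThreads/example-topic/threads.json',
        expThreadFlag=True)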
def genFacebookCols(self, name, settings):

    print('\ngenFacebookCols():')
    filename = settings['inputFileWithPosts'].split('/')[-1].replace('.json', '')

    fbCol = getDictFromFile(settings['inputFileWithPosts'])
    fbCol = socMedGenCol.normalizeCol(fbCol)

    return fbCol
def __init__(self, configFilename):

    print('\nExtractMicroCol::init() - start')

    reportFilename = configFilename.replace('config.json', 'report.json')
    print('\tconfig:', configFilename)
    print('\treport:', reportFilename)

    self.cols = getDictFromFile(configFilename)
    self.cache = getDictFromFile(reportFilename)
    self.reportFilename = reportFilename
    self.health = False

    if ('sources' in self.cols and 'collectionTopic' in self.cols):

        #create the per-topic cache folders used throughout the pipeline
        for cacheFolder in [
                'Deg1Twttr', 'ExpTwttrThreads', 'ImpTwttrThreads', 'Sources',
                'Tweets', 'SegmentedCols', 'ShortURIs', 'Plots', 'CDFs']:
            createFolderAtPath('./Caches/' + cacheFolder + '/' + self.cols['collectionTopic'])

        print('\tsources:', len(self.cols['sources']))
        for src in self.cols['sources']:
            print('\t\t', src['name'], 'active:', src['active'])

        self.extractCols()
        self.health = True
    else:
        print('\tmissing config')

    print('ExtractMicroCol::init() - end\n')
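# A minimal sketch of the config.json shape __init__() expects, inferred from
# the keys read above and in genTwitterCols()/getColFromCache(); every value
# below is illustrative, not a real configuration:
_exampleConfig = {
    'collectionTopic': 'example-topic',
    'degree-1-twt-cols': {
        'active': {'twitter': True}
    },
    'sources': [
        {
            'name': 'twitter',
            'active': True,
            'id': 'src-0',
            'inputFileWithTweets': './input/tweets.json'
        }
    ]
}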
def main(inFilenamePath, config):

    print('\nmain()')
    if (len(config) == 0):
        return

    filename = inFilenamePath.split('/')[-1].replace('.json', '')
    col = getDictFromFile('./' + inFilenamePath)
    col, metadata = normalizeCol(col)

    genSerpOrThreadsCol(col, filename, source=metadata['source'])

    threads = getThreads(col, config['maxThreadsToExtract'])
    genThreadsCol(threads, config, filename)
def __init__(self, goldstandardFilename):

    self.goldstandardFilename = goldstandardFilename
    self.goldstandard = getDictFromFile(goldstandardFilename)
    self.simCoeff = -1

    if ('sim-coeff' in self.goldstandard):
        self.simCoeff = self.goldstandard['sim-coeff']
    elif ('uris' in self.goldstandard):
        PrecEval.getHTMLAndTextForURILst(self.goldstandard, self.goldstandardFilename)
        self.setSimCoeff()

        #parallel version did not achieve decent speedup
        #self.prlSetSimCoeff()
    else:
        print('\tInvalid goldstandard supplied')
        print('\t', self.goldstandard)
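# A sketch of the two goldstandard shapes __init__() accepts: either a
# precomputed 'sim-coeff', or a 'uris' list from which setSimCoeff() derives
# one. The URIs and threshold value here are illustrative:
_exampleGoldstandardPrecomputed = {
    'sim-coeff': 0.25
}
_exampleGoldstandardURIs = {
    'uris': [
        {'uri': 'https://example.com/page-1'},
        {'uri': 'https://example.com/page-2'}
    ]
}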
def getColFromCache(self, colname, id):

    if (len(self.cache) == 0):
        return {}

    singleSrcFilename = './Caches/Sources/' + self.cols['collectionTopic'] + '/' + id + '.json'
    src = getDictFromFile(singleSrcFilename)
    if (len(src) != 0):
        print('\tgetColFromCache():', id, 'HIT 1')
        return src

    for src in self.cache['sources']:
        if (src['name'] == colname and src['id'] == id):
            if ('output' in src and len(src['output']) != 0):
                print('\tgetColFromCache():', src['name'], 'HIT 2')
                return src['output']

    return {}
def main(goldFilename, testFilename=None):

    goldFilename = goldFilename.strip()
    if (len(goldFilename) == 0):
        return

    prevNow = datetime.now()
    goldstandard = PrecEval(goldFilename)

    if (testFilename is not None):
        tstFile = getDictFromFile(testFilename)
        PrecEval.getHTMLAndTextForURILst(tstFile, testFilename)

        tstFile['timestamp'] = getNowTime()
        tstFile['predicted-precision'] = PrecEval.prlEvalCol(
            tstFile, goldstandard.goldstandard, removeTxt=False)
        tstFile['sim-coeff'] = goldstandard.simCoeff

        dumpJsonToFile(testFilename, tstFile)

    delta = datetime.now() - prevNow
    print('\tdelta seconds:', delta.seconds)
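# Hypothetical invocation of the PrecEval driver above (both filenames are
# made up): evaluating a test collection against a goldstandard writes
# 'predicted-precision' and 'sim-coeff' back into the test file.
def _examplePrecEvalRun():
    main('./goldstandards/example-gold.json',
         testFilename='./collections/example-test.json')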
def main(segF):

    segment = getDictFromFile(segF)
    if (len(segment) == 0):
        print('\tSegment is corrupt, returning')
        return

    print('\ncarbon date segment():')
    prevNow = datetime.now()

    excludeDomains = ['archive.is']
    ignoreEmptyFiles = False
    cacheOnInd = -1  #-1 to switch off
    progress = 0

    for seg in ['ss', 'ms', 'mm', 'mc']:

        if (seg == 'mc'):
            continue

        jobsLst = []
        segSize = len(segment['segmented-cols'][seg])

        for i in range(segSize):

            carbonDateServerStartStop('start')
            uriSize = len(segment['segmented-cols'][seg][i]['uris'])

            for j in range(uriSize):

                progress += 1
                uri = segment['segmented-cols'][seg][i]['uris'][j]
                uriSeg = getURISeg(uri)

                #skip carbon dating non-relevant uris for now
                if ('relevant' in uri):
                    if (uri['relevant'] == False):
                        continue

                dom = getDomain(uri['uri'])
                if (dom in excludeDomains):
                    print('\texcluding:', dom)
                    continue

                uriHash = getURIHash(uriSeg['long'])
                htmlFile = './Caches/HTML/' + uriHash + '.html'
                outfilename = './Caches/CD/' + uriHash + '.txt'

                altOutfilename = ''
                if (uriSeg['short'] != ''):
                    altOutfilename = './Caches/CD/' + getURIHash(uriSeg['short']) + '.txt'

                if (os.path.exists(outfilename)):
                    pubDate = readTextFromFile(outfilename)
                    pubDate = pubDate.strip()

                    if (pubDate == ''):
                        #cached result is empty: skip only if empty files are ignored
                        if (ignoreEmptyFiles):
                            continue
                    else:
                        #pub date already extracted, skip
                        continue

                toPrint = '\tseg: ' + seg + ' ' + str(j) + ' of ' + str(uriSize) + ', ' + str(i) + ' of ' + str(segSize) + ', p: ' + str(progress)
                #print(toPrint)

                if (progress < cacheOnInd):
                    print('\tskipping', progress)
                    continue

                html = ''
                if (os.path.exists(htmlFile)):
                    html = readTextFromFile(htmlFile)

                keywords = {
                    'uri': uriSeg['long'],
                    'html': html,
                    'outfilename': outfilename,
                    'altOutfilename': altOutfilename
                }

                jobsLst.append({
                    'func': getPubDate,
                    'args': keywords,
                    'misc': False,
                    'print': toPrint
                })

        resLst = []
        jobCount = len(jobsLst)
        if (jobCount != 0):
            print('jobsLst:', jobCount)
            resLst = parallelTask(jobsLst, threadCount=3)

    delta = datetime.now() - prevNow
    print('\tdelta seconds:', delta.seconds)
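# The jobsLst/parallelTask() contract used above and in PrecEval below, as a
# minimal sketch assumed from usage (not the project's actual implementation):
# each job carries a callable, its keyword args, a 'misc' payload echoed back
# unchanged, and a 'print' label; each result pairs the callable's return
# value ('output') with that 'misc' payload.
from concurrent.futures import ThreadPoolExecutor

def _parallelTaskSketch(jobsLst, threadCount=3):

    def runJob(job):
        if (job['print'] != ''):
            print(job['print'])
        return {'output': job['func'](**job['args']), 'misc': job['misc']}

    with ThreadPoolExecutor(max_workers=threadCount) as executor:
        return list(executor.map(runJob, jobsLst))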
def genTwitterCols(self, name, settings):

    print('\ngenTwitterCols():')
    output = {
        'serp': {},
        'explicit-thread-cols': {},
        'implicit-thread-cols': {},
        'serp-heuristics': {}
    }

    filename = settings['inputFileWithTweets'].split('/')[-1].replace('.json', '')
    twtCol = getDictFromFile(settings['inputFileWithTweets'])
    twtCol = socMedGenCol.normalizeCol(twtCol)
    output['serp'] = twtCol

    threadOptions = {
        'extractExpThreadCol': {
            'input': 'explicit-thread-links',
            'output': 'explicit-thread-cols',
            'trim': 'maxExpThreadToExplore',
            'expFlag': True
        },
        'extractImpThreadCol': {
            'input': 'possible-implicit-thread-links',
            'output': 'implicit-thread-cols',
            'trim': 'maxImpThreadToExplore',
            'expFlag': False
        }
    }

    for threadOption, params in threadOptions.items():

        if (settings[threadOption]):
            trim = settings[params['trim']]
            print('\t' + threadOption + ' count:', len(twtCol['payload'][params['input']]))
            print('\t\twill extract max:', trim)

            #'Exp' or 'Imp' from the output key, e.g., 'explicit-thread-cols' -> 'Exp'
            expOrImp = params['output'][:3]
            expOrImp = expOrImp[0].upper() + expOrImp[1:]
            cacheFilename = './Caches/' + expOrImp + 'TwttrThreads/' + self.cols['collectionTopic'] + '/' + 'threads.json'

            output[params['output']] = socMedGenCol.genExplThreadsCol(
                twtCol['payload'][params['input']][:trim],
                settings,
                cacheFilename,
                expThreadFlag=params['expFlag'])
        else:
            print('\t', threadOption, 'off, will read report cache')
            if ('id' in settings):
                cache = self.getColFromCache(name, settings['id'])
                if (params['output'] in cache):
                    output[params['output']] = cache[params['output']]
            else:
                print('\tcan\'t read cache, id not provided')

    #add degree 1 serp col - start
    print('\t adding degree 1 cols')
    deg1Settings = self.cols['degree-1-twt-cols']
    dedupSet = set()

    if ('tweets' in output['serp']['payload']):

        #don't add a tweet already explored in degree 2
        for twt in output['serp']['payload']['tweets']:
            dedupSet.add(twt['data-tweet-id'])

        output['degree-1-twt-col'] = [{'name': name, 'tweet-links': []}]
        ExtractMicroCol.addDegree1TwtLinks(
            output['serp']['payload']['tweets'],
            output['degree-1-twt-col'][-1]['tweet-links'],
            dedupSet)

        print('\tdeg 1', name, 'active:', deg1Settings['active'][name])
        if (deg1Settings['active'][name]):
            cacheFilename = './Caches/Deg1Twttr/' + self.cols['collectionTopic'] + '/' + name + '.json'
            ExtractMicroCol.addTwDeg1Col(
                name,
                output['degree-1-twt-col'][-1]['tweet-links'],
                cacheFilename,
                deg1Settings)

        for colOpt in ['explicit-thread-cols', 'implicit-thread-cols']:

            if ('thread-cols' in output[colOpt]):

                threadTypeName = name + '-' + colOpt[:3] + '-threads'
                output['degree-1-twt-col'].append({
                    'name': threadTypeName,
                    'tweet-links': []
                })

                for threadCol in output[colOpt]['thread-cols']:

                    if ('tweets' not in threadCol):
                        continue

                    ExtractMicroCol.addDegree1TwtLinks(
                        threadCol['tweets'],
                        output['degree-1-twt-col'][-1]['tweet-links'],
                        dedupSet)

                print('\tdeg 1', threadTypeName, 'active:', deg1Settings['active'][threadTypeName])
                cacheFilename = './Caches/Deg1Twttr/' + self.cols['collectionTopic'] + '/' + threadTypeName + '.json'
                ExtractMicroCol.addTwDeg1Col(
                    threadTypeName,
                    output['degree-1-twt-col'][-1]['tweet-links'],
                    cacheFilename,
                    deg1Settings)
    #add degree 1 serp col - end

    return output
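# A sketch of the settings dict genTwitterCols() consumes, inferred from the
# keys read above; paths, limits, and the id are illustrative:
_exampleTwitterSettings = {
    'inputFileWithTweets': './input/example-serp.json',
    'extractExpThreadCol': True,
    'maxExpThreadToExplore': 10,
    'extractImpThreadCol': False,
    'maxImpThreadToExplore': 10,
    'id': 'src-0',
    #genExplThreadsCol() also reads its extraction knobs from this same dict:
    'maxRetryCount': 2,
    'tweetConvMaxTweetCount': 100,
    'maxNoMoreTweetCounter': 4,
    'chromedriverPath': '/usr/local/bin/chromedriver'
}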
def getHTMLAndTextForURILst(col, outfilename=None, printSuffix='', extraParams=None):

    if (extraParams is None):
        extraParams = {}

    extraParams.setdefault('simCacheLookup', True)

    jobsLst = []
    statusCodeJobsLst = []
    jobSize = len(col['uris'])

    for i in range(jobSize):

        uri = col['uris'][i]
        if ('hash' not in uri):
            uri['hash'] = getURIHash(uri['uri'])

        if (PrecEval.uriDctHasBasics(uri) and extraParams['simCacheLookup']):
            #ignore already proc. files, usually already proc. segments
            #except cache lookup is off
            continue

        #attempt - cache - start
        cosineSimFile = './Caches/CosineSim/' + col['uris'][i]['hash'] + '.json'
        if (os.path.exists(cosineSimFile) and extraParams['simCacheLookup']):

            cache = getDictFromFile(cosineSimFile)
            if (PrecEval.uriDctHasBasics(cache)):
                uri['text'] = cache['text']
                uri['text-len'] = cache['text-len']
                uri['title'] = cache['title']
                uri['status-code'] = cache['status-code']
                #print('\t\tskipping since cache available')
                continue

        if ('custom' in uri):
            if ('mime' in uri['custom']):
                if (uri['custom']['mime'] != 'text/html'):
                    print('\tskipping', uri['custom']['mime'])
                    uri['text'] = 'NoneHTML'
                    uri['text-len'] = 8
                    uri.setdefault('title', '')
                    uri.setdefault('status-code', -1)
                    continue
        '''
        txtFile = './Caches/Plaintext/' + uri['hash'] + '.txt'
        htmlFile = './Caches/HTML/' + uri['hash'] + '.html'

        if( os.path.exists(txtFile) ):
            uri['text'] = readTextFromFile(txtFile)
            uri['text-len'] = len(uri['text'])
            uri['title'] = extractPageTitleFromHTML( readTextFromFile(htmlFile) )
            continue
        '''
        #attempt - cache - end

        jobsLst.append({
            'func': mimicBrowser,
            'args': {
                'uri': uri['uri'],
                'extraParams': {'sizeRestrict': 4000000}
            },
            'misc': {'i': i, 'hash': uri['hash']},
            'print': 'gtHTML.URILst->dfURI(): ' + str(i) + ' of ' + str(jobSize) + printSuffix
            #+ '\n\tu: ' + uri['uri']
        })

        statusCodeJobsLst.append({
            'func': mimicBrowser,
            'args': {
                'uri': uri['uri'],
                'getRequestFlag': False,
                'extraParams': None
            },
            'misc': {'i': i, 'hash': uri['hash']},
            'print': 'gtHTML.URILst->mkHdReq.(): ' + str(i) + ' of ' + str(jobSize) + printSuffix
        })

    resLst = []
    if (len(jobsLst) != 0):
        resLst = parallelTask(jobsLst, threadCount=3)

    for res in resLst:

        html = res['output']
        plaintext = clean_html(html)
        indx = res['misc']['i']

        col['uris'][indx]['text'] = plaintext
        col['uris'][indx]['text-len'] = len(plaintext)
        col['uris'][indx]['title'] = extractPageTitleFromHTML(html)

        writeTextToFile('./Caches/HTML/' + res['misc']['hash'] + '.html', html)
        print('\t\thtmllen:', len(html))

        writeTextToFile('./Caches/Plaintext/' + res['misc']['hash'] + '.txt', plaintext)
        print('\t\tplaintextlen:', len(plaintext))

    resLst = []
    if (len(statusCodeJobsLst) != 0):
        resLst = parallelTask(statusCodeJobsLst, threadCount=3)

    for res in resLst:

        headReq = res['output']
        indx = res['misc']['i']

        cache = {}
        cache['text'] = col['uris'][indx]['text']
        cache['text-len'] = col['uris'][indx]['text-len']
        cache['title'] = col['uris'][indx]['title']

        cache['status-code'] = -1
        col['uris'][indx]['status-code'] = -1
        if ('status-code' in headReq):
            cache['status-code'] = headReq['status-code']
            col['uris'][indx]['status-code'] = headReq['status-code']

        cacheFilename = './Caches/CosineSim/' + res['misc']['hash'] + '.json'
        dumpJsonToFile(cacheFilename, cache)

    col['timestamp'] = getNowTime()
    if (outfilename is not None):
        dumpJsonToFile(outfilename, col)
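# Shape of the per-URI record getHTMLAndTextForURILst() writes to
# ./Caches/CosineSim/<hash>.json, with the fields set above (prlEvalCol()
# later adds 'sim' and a few provenance fields to the same file); the values
# here are illustrative:
_exampleCosineSimCacheRecord = {
    'text': 'plaintext extracted from the page',
    'text-len': 33,
    'title': 'Example page title',
    'status-code': 200
}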
def prlEvalCol(col, goldstandard, removeTxt=True, extraParams=None):

    if (extraParams is None):
        extraParams = {}

    extraParams.setdefault('minTextSize', 300)
    '''
    Important notes:
    1. If minTextSize is changed, or if the gold standard text content is
       changed, set simCacheLookup to False to skip the cache lookup and
       force the similarity to be recalculated.
    2. If the gold standard sim-coeff is changed, no need to do anything.
    '''
    extraParams.setdefault('simCacheLookup', True)
    extraParams.setdefault('printSuffix', '')

    colsize = len(col['uris'])
    if (colsize == 0 or len(goldstandard) == 0):
        print('\tprlEvalCol(): colsize is 0 or goldstandard == 0, returning')
        return -1

    if ('uris' not in goldstandard):
        print('\tprlEvalCol(): no uris in goldstandard, returning')
        return -1

    goldRange = list(range(len(goldstandard['uris'])))
    combinedGold = PrecEval.combineDocsForIndices(goldstandard['uris'], goldRange)

    precision = 0
    validColSize = 0
    jobsLst = []

    for i in range(colsize):

        #attempt getting sim from cache - start
        cosineSimFile = './Caches/CosineSim/' + col['uris'][i]['hash'] + '.json'
        if (os.path.exists(cosineSimFile) and extraParams['simCacheLookup']):

            cosSim = getDictFromFile(cosineSimFile)
            if ('sim' in cosSim):

                col['uris'][i]['sim'] = cosSim['sim']
                if (cosSim['sim'] != -1):
                    validColSize += 1

                if (PrecEval.isRel(cosSim['sim'], goldstandard['sim-coeff'])):
                    col['uris'][i]['relevant'] = True
                    precision += 1
                else:
                    col['uris'][i]['relevant'] = False

                continue
        #attempt getting sim from cache - end

        noopFlag = False
        usingSubText = ''
        if (len(col['uris'][i]['text']) < extraParams['minTextSize']):

            if ('post-details' in col['uris'][i]):
                #gold standards do not have post-details
                if ('substitute-text' in col['uris'][i]['post-details']):

                    subText = col['uris'][i]['post-details']['substitute-text'].strip()
                    if (subText != ''):
                        col['uris'][i]['text'] = subText
                        col['uris'][i]['custom']['substitute-text-active'] = True
                        usingSubText = '\n\t\tusing subtext: ' + col['uris'][i]['uri']
                    else:
                        noopFlag = True
            else:
                #don't process uris with small text
                #don't skip (continue) so cache can update
                noopFlag = True

        matrix = [col['uris'][i]['text'], combinedGold]
        keywords = {'matrix': matrix, 'noopFlag': noopFlag}
        toPrint = '\tprlEvalCol():' + str(i) + ' of ' + str(colsize) + ' ' + extraParams['printSuffix'] + usingSubText

        if ('status-code' not in col['uris'][i]):
            print('\tproblem ahead for uri:', col['uris'][i]['uri'])
            print('\tproblem ahead for hash:', col['uris'][i]['hash'])
            print('\tproblem ahead for cosineSimFile:', cosineSimFile)
            print('\tproblem ahead for keys:', col['uris'][i].keys())

        cache = {
            'hash': col['uris'][i]['hash'],
            'self': cosineSimFile,
            'uri': col['uris'][i]['uri'],
            'title': col['uris'][i]['title'],
            'text': col['uris'][i]['text'],
            'text-len': len(col['uris'][i]['text']),
            'status-code': col['uris'][i]['status-code']
        }

        jobsLst.append({
            'func': PrecEval.calcPairSim,
            'args': keywords,
            'misc': {'i': i, 'cache': cache},
            'print': toPrint
        })

    resLst = []
    if (len(jobsLst) != 0):
        resLst = parallelTask(jobsLst, threadCount=3)

    for res in resLst:

        indx = res['misc']['i']
        cache = res['misc']['cache']
        sim = res['output']

        col['uris'][indx]['sim'] = sim
        if (sim != -1):
            validColSize += 1

        if (PrecEval.isRel(sim, goldstandard['sim-coeff'])):
            col['uris'][indx]['relevant'] = True
            precision += 1
        else:
            col['uris'][indx]['relevant'] = False

        #write cache - start
        cache['sim'] = sim
        dumpJsonToFile(cache['self'], cache)
        #write cache - end

    if (removeTxt):
        for i in range(colsize):
            if ('text' in col['uris'][i]):
                del col['uris'][i]['text']

    if (validColSize > 0):
        return precision / validColSize
    else:
        return -1
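# prlEvalCol() returns predicted precision = (relevant URIs) / (URIs with a
# valid sim), where a URI counts as relevant when isRel(sim, sim-coeff) holds.
# A toy illustration (sims and threshold are made up, and isRel() is assumed
# to be a sim >= sim-coeff threshold test): with sims [0.4, 0.1, -1, 0.3] and
# sim-coeff 0.25, validColSize = 3 (the -1 is excluded), two URIs pass the
# threshold, so the function returns 2/3.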