Example #1
#Build a collection of expanded tweet threads: serve it from cache when
#available, otherwise extract the threads in parallel and cache the result.
def genExplThreadsCol(threads, config, cacheFilename, expThreadFlag=True):

    if (len(threads) == 0):
        return {}

    prevNow = datetime.now()

    twts = getDictFromFile(cacheFilename)
    if (len(twts) == 0):

        twts = retryParallelTwtsExt(
            threads,
            maxRetryCount=config['maxRetryCount'],
            tweetConvMaxTweetCount=config['tweetConvMaxTweetCount'],
            maxNoMoreTweetCounter=config['maxNoMoreTweetCounter'],
            chromedriverPath=config['chromedriverPath'],
            extraParams=config)

        if (len(twts) != 0):
            dumpJsonToFile(cacheFilename, twts, indentFlag=False)
    else:
        print('\ngenExplThreadsCol(): read tweets from cache:', cacheFilename)

    delta = datetime.now() - prevNow
    print('\tdelta seconds:', delta.seconds)
    twts = updateCache(threads, config, cacheFilename, expThreadFlag, twts)
    twts = normalizeCol(twts, expThreadFlag)

    return twts
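
#A minimal usage sketch for genExplThreadsCol (not from the source): the
#config keys mirror the ones read above; the thread URI and cache filename
#are hypothetical.
if __name__ == '__main__':
    config = {
        'maxRetryCount': 2,
        'tweetConvMaxTweetCount': 100,
        'maxNoMoreTweetCounter': 4,
        'chromedriverPath': '/usr/bin/chromedriver'
    }
    threads = ['https://twitter.com/someuser/status/1234567890']
    expThreadsCol = genExplThreadsCol(threads, config, 'expl_threads_cache.json')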
Example #2
def main(goldFilename, testFilename=None):

    goldFilename = goldFilename.strip()
    if (len(goldFilename) == 0):
        return

    prevNow = datetime.now()
    goldstandard = PrecEval(goldFilename)

    if (testFilename is not None):
        tstFile = getDictFromFile(testFilename)
        PrecEval.getHTMLAndTextForURILst(tstFile, testFilename)

        tstFile['timestamp'] = getNowTime()
        tstFile['predicted-precision'] = PrecEval.prlEvalCol(
            tstFile, goldstandard.goldstandard, removeTxt=False)
        tstFile['sim-coeff'] = goldstandard.simCoeff
        dumpJsonToFile(testFilename, tstFile)

    delta = datetime.now() - prevNow
    print('\tdelta seconds:', delta.seconds)
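
#Hypothetical invocation (filenames illustrative, not from the source):
#evaluates the collection in test.json against the gold standard in
#gold.json and writes 'predicted-precision' back into test.json.
#
#  main('gold.json', testFilename='test.json')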
Example #3
#Fetch tweets for thread URIs missing from the cached collection, append
#them, and re-dump the cache file.
def updateCache(threads, config, cacheFilename, expThreadFlag, twts):

    print('\nupdateCache')

    if (len(twts) == 0 or len(threads) == 0):
        print('\tempty returning:', len(twts), len(threads))
        return twts

    cacheURIs = {}
    newReqURIs = []

    for twtCol in twts:
        cacheURIs[twtCol['self']] = True

    for uri in threads:
        if (uri not in cacheURIs):
            newReqURIs.append(uri)

    print('\tnew uris:', len(newReqURIs))
    if (len(newReqURIs) == 0):
        return twts
    print('\twill attempt updating cache')

    updateTwts = retryParallelTwtsExt(
        newReqURIs,
        maxRetryCount=config['maxRetryCount'],
        tweetConvMaxTweetCount=config['tweetConvMaxTweetCount'],
        maxNoMoreTweetCounter=config['maxNoMoreTweetCounter'],
        chromedriverPath=config['chromedriverPath'],
        extraParams=config)

    if (len(updateTwts) != 0):
        twts = twts + updateTwts
        dumpJsonToFile(cacheFilename, twts, indentFlag=False)

    return twts
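
#Sketch of the membership contract updateCache relies on (illustrative data,
#not from the source): each cached collection records its request URI under
#'self', so a thread URI is re-fetched only if no cached entry claims it.
#Here only the second URI would be fetched and appended to the cache:
#
#  cachedTwts = [{'self': 'https://twitter.com/a/status/1', 'tweets': []}]
#  threadURIs = ['https://twitter.com/a/status/1',
#                'https://twitter.com/b/status/2']
#  updateCache(threadURIs, config, 'cache.json', True, cachedTwts)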
Example #4
    def writeReport(self, indentFlag=False):
        if (self.health):
            dumpJsonToFile(self.reportFilename,
                           self.cols,
                           indentFlag=indentFlag)
Example #5
    def extractCols(self):

        allSegmentedCols = {
            'name': 'all',
            'timestamp': getNowTime(),
            "extraction-timestamp": getNowTime(),
            'segmented-cols': {
                'ss': [],
                'sm': [],
                'ms': [],
                'mm': [],
                'mc': []
            }
        }

        print('\nextractCols() - start')

        for i in range(len(self.cols['sources'])):

            src = self.cols['sources'][i]
            print('\tsrc/active:', src['name'], src['active'])

            src.setdefault('config', {})
            #transfer generic plot config to local src config
            src['config']['generic'] = self.cols['generic']

            if (not src['active'] and 'id' in src):
                self.cols['sources'][i]['output'] = self.getColFromCache(
                    src['name'], src['id'])

            if (src['name'] == 'reddit'):

                if (src['active']):
                    self.cols['sources'][i]['output'] = redditSearchExpand(
                        src['query'],
                        maxPages=src['config']['maxPages'],
                        extraParams=src['config'])

                if (src['config']['expandDegree1Comments']):
                    #allow updating (re-caching) the source once degree-1 links have been explored
                    src['active'] = True

                SegmentCols.genericAddReplyGroup(
                    self.cols['sources'][i]['output'],
                    SegmentCols.redditAuthorComp)
                segCols = SegmentCols.redditSegmentCols(
                    self.cols['sources'][i]['output'],
                    self.cols['collectionTopic'],
                    src['id'],
                    src['config']['sort'],
                    extraParams=src['config'])
                ExtractMicroCol.cpAllSegCols(
                    allSegmentedCols['segmented-cols'], segCols, src['name'])

                segOutfilename = './Caches/SegmentedCols/' + self.cols[
                    'collectionTopic'] + '/' + src['id'] + '.json'
                dumpJsonToFile(segOutfilename, segCols, indentFlag=True)

            elif (src['name'] == 'wikipedia'):

                if (src['active']):
                    self.cols['sources'][i][
                        'output'] = wikipediaGetExternalLinksDictFromPage(
                            src['uri'])

            elif (src['name'] == 'twitter-serp'):

                if (src['active']):
                    self.cols['sources'][i]['output'] = self.genTwitterCols(
                        src['name'], src['config'])

                segCols = SegmentCols.twitterSegmentCols(
                    self.cols['sources'][i]['output'],
                    self.cols['collectionTopic'],
                    src['id'],
                    src['config']['vertical'],
                    extraParams=src['config'])
                ExtractMicroCol.cpAllSegCols(
                    allSegmentedCols['segmented-cols'], segCols, src['name'])

            elif (src['name'] == 'twitter-moments'):

                if (src['active']):
                    self.cols['sources'][i]['output'] = self.genTwitterMoments(
                        src['name'], src['uris'], src['query'])

                segCols = SegmentCols.twitterMomentsSegmentCols(
                    self.cols['sources'][i]['output'],
                    self.cols['collectionTopic'],
                    src['id'],
                    extraParams=src['config'])
                ExtractMicroCol.cpAllSegCols(
                    allSegmentedCols['segmented-cols'], segCols, src['name'])

            elif (src['name'] == 'facebook'):

                print('\tFB is off')
                '''
                if( src['active'] ):
                    self.cols['sources'][i]['output'] = self.genFacebookCols( src['name'], src['config'] )

                SegmentCols.genericAddReplyGroup( self.cols['sources'][i]['output'], SegmentCols.facebookAuthorComp )
                SegmentCols.facebookSegmentCols(self.cols['sources'][i]['output'], self.cols['collectionTopic'], src['id'], extraParams=src['config'])
                '''
            elif (src['name'] == 'scoopit'):

                if (src['active']):
                    self.cols['sources'][i]['output'] = self.genScoopItCols(
                        src['config'])

                segCols = SegmentCols.scoopitSegmentCols(
                    self.cols['sources'][i]['output'],
                    self.cols['collectionTopic'],
                    src['id'],
                    extraParams=src['config'])
                ExtractMicroCol.cpAllSegCols(
                    allSegmentedCols['segmented-cols'], segCols, src['name'])

            elif (src['name'] == 'sutori'):

                if (src['active']):
                    self.cols['sources'][i]['output'] = self.genSutoriCols(
                        src['query'], src['config'])

                segCols = SegmentCols.sutoriSegmentCols(
                    self.cols['sources'][i]['output'],
                    self.cols['collectionTopic'],
                    src['id'],
                    extraParams=src['config'])
                ExtractMicroCol.cpAllSegCols(
                    allSegmentedCols['segmented-cols'], segCols, src['name'])

            elif (src['name'] == 'all'):
                SegmentCols.genericSegmentCols(allSegmentedCols,
                                               self.cols['collectionTopic'],
                                               src['id'],
                                               extraParams=src['config'])

            #save src in cache - start
            if ('output' in self.cols['sources'][i] and src['active']):
                if (len(self.cols['sources'][i]['output']) != 0):
                    tmpSrcCacheFilename = './Caches/Sources/' + self.cols[
                        'collectionTopic'] + '/' + src['id'] + '.json'
                    dumpJsonToFile(tmpSrcCacheFilename,
                                   self.cols['sources'][i]['output'])
            #save src in cache - end
            print()

        print('extractCols() - end\n')
        return self.cols
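
    #extractCols() assumes self.cols was loaded from a config shaped roughly
    #like this sketch; the keys are inferred from the accesses above and the
    #values are illustrative, not from the source:
    #
    #  {
    #      'collectionTopic': 'ebola',
    #      'generic': {},
    #      'sources': [{
    #          'name': 'reddit',
    #          'id': 'reddit-0',
    #          'active': True,
    #          'query': 'ebola virus',
    #          'config': {'maxPages': 1, 'sort': 'relevance',
    #                     'expandDegree1Comments': False}
    #      }]
    #  }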

    #Dereference each URI in col['uris']: download the HTML in parallel,
    #derive plaintext and title, issue HEAD requests for status codes, and
    #cache per-URI results.
    def getHTMLAndTextForURILst(col,
                                outfilename=None,
                                printSuffix='',
                                extraParams=None):

        if (extraParams is None):
            extraParams = {}

        extraParams.setdefault('simCacheLookup', True)

        jobsLst = []
        statusCodeJobsLst = []
        jobSize = len(col['uris'])
        for i in range(jobSize):

            uri = col['uris'][i]

            if ('hash' not in uri):
                uri['hash'] = getURIHash(uri['uri'])

            if (PrecEval.uriDctHasBasics(uri)
                    and extraParams['simCacheLookup']):
                #skip already-processed URIs (usually segments),
                #unless cache lookup is turned off
                continue

            #attempt - cache - start
            cosineSimFile = './Caches/CosineSim/' + col['uris'][i][
                'hash'] + '.json'
            if (os.path.exists(cosineSimFile)
                    and extraParams['simCacheLookup']):

                cache = getDictFromFile(cosineSimFile)
                if (PrecEval.uriDctHasBasics(cache)):
                    uri['text'] = cache['text']
                    uri['text-len'] = cache['text-len']
                    uri['title'] = cache['title']
                    uri['status-code'] = cache['status-code']
                    #print('\t\tskipping since cache available')
                    continue

            if ('custom' in uri):
                if ('mime' in uri['custom']):
                    if (uri['custom']['mime'] != 'text/html'):

                        print('\tskipping', uri['custom']['mime'])
                        uri['text'] = 'NoneHTML'
                        uri['text-len'] = 8

                        uri.setdefault('title', '')
                        uri.setdefault('status-code', -1)
                        continue
            '''
            txtFile = './Caches/Plaintext/' + uri['hash'] + '.txt'
            htmlFile = './Caches/HTML/' + uri['hash'] + '.html'
            if( os.path.exists(txtFile) ):
                uri['text'] = readTextFromFile(txtFile)
                uri['text-len'] = len(uri['text'])
                uri['title'] = extractPageTitleFromHTML( readTextFromFile(htmlFile) )
                continue
            '''
            #attempt - cache - end

            jobsLst.append({
                'func': mimicBrowser,
                'args': {
                    'uri': uri['uri'],
                    'extraParams': {
                        'sizeRestrict': 4000000
                    }
                },
                'misc': {
                    'i': i,
                    'hash': uri['hash']
                },
                'print': 'gtHTML.URILst->dfURI(): ' + str(i) + ' of ' +
                         str(jobSize) + printSuffix  #+ '\n\tu: ' + uri['uri']
            })

            statusCodeJobsLst.append({
                'func': mimicBrowser,
                'args': {
                    'uri': uri['uri'],
                    'getRequestFlag': False,
                    'extraParams': None
                },
                'misc': {
                    'i': i,
                    'hash': uri['hash']
                },
                'print': 'gtHTML.URILst->mkHdReq.(): ' + str(i) + ' of ' +
                         str(jobSize) + printSuffix
            })

        resLst = []
        if (len(jobsLst) != 0):
            resLst = parallelTask(jobsLst, threadCount=3)

        for res in resLst:

            html = res['output']
            plaintext = clean_html(html)
            indx = res['misc']['i']

            col['uris'][indx]['text'] = plaintext
            col['uris'][indx]['text-len'] = len(plaintext)
            col['uris'][indx]['title'] = extractPageTitleFromHTML(html)

            writeTextToFile('./Caches/HTML/' + res['misc']['hash'] + '.html',
                            html)
            print('\t\thtmllen:', len(html))
            writeTextToFile(
                './Caches/Plaintext/' + res['misc']['hash'] + '.txt',
                plaintext)
            print('\t\tplaintextlen:', len(plaintext))

        resLst = []
        if (len(statusCodeJobsLst) != 0):
            resLst = parallelTask(statusCodeJobsLst, threadCount=3)

        for res in resLst:

            headReq = res['output']
            indx = res['misc']['i']

            cache = {}
            cache['text'] = col['uris'][indx]['text']
            cache['text-len'] = col['uris'][indx]['text-len']
            cache['title'] = col['uris'][indx]['title']
            cache['status-code'] = -1

            col['uris'][indx]['status-code'] = -1
            if ('status-code' in headReq):
                cache['status-code'] = headReq['status-code']
                col['uris'][indx]['status-code'] = headReq['status-code']

            cacheFilename = './Caches/CosineSim/' + res['misc'][
                'hash'] + '.json'
            dumpJsonToFile(cacheFilename, cache)

        col['timestamp'] = getNowTime()
        if (outfilename is not None):
            dumpJsonToFile(outfilename, col)
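
    #Contract assumed by the parallelTask() calls above and below (a sketch,
    #since parallelTask comes from the surrounding codebase): each job carries
    #a callable ('func'), keyword arguments ('args'), caller state ('misc'),
    #and a progress string ('print'); each result echoes 'misc' and exposes
    #the callable's return value under 'output'. Roughly:
    #
    #  def parallelTask(jobsLst, threadCount=3):
    #      resLst = []
    #      for job in jobsLst:  #the real version runs jobs concurrently
    #          print(job['print'])
    #          resLst.append({'output': job['func'](**job['args']),
    #                         'misc': job['misc']})
    #      return resLst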

    #Estimate the precision of col against the gold standard: each document's
    #cosine similarity to the combined gold-standard text decides relevance
    #(via PrecEval.isRel and goldstandard['sim-coeff']).
    def prlEvalCol(col, goldstandard, removeTxt=True, extraParams=None):

        if (extraParams is None):
            extraParams = {}

        extraParams.setdefault('minTextSize', 300)
        '''
            Important note:
            1. If minTextSize is changed, or the gold standard text content is
               changed, set simCacheLookup to False to skip the cache lookup and
               force the similarity to be recalculated.

            2. If the gold standard sim-coeff is changed, nothing needs to be done.
        '''
        extraParams.setdefault('simCacheLookup', True)
        extraParams.setdefault('printSuffix', '')

        colsize = len(col['uris'])

        if (colsize == 0 or len(goldstandard) == 0):
            print(
                '\tprlEvalCol(): colsize is 0 or goldstandard == 0, returning')
            return -1

        if ('uris' not in goldstandard):
            print('\tprlEvalCol(): no uris in goldstandard, returning')
            return -1

        goldRange = list(range(len(goldstandard['uris'])))
        combinedGold = PrecEval.combineDocsForIndices(goldstandard['uris'],
                                                      goldRange)

        precision = 0
        validColSize = 0
        jobsLst = []
        for i in range(colsize):

            #attempt getting sim from cache - start
            cosineSimFile = './Caches/CosineSim/' + col['uris'][i][
                'hash'] + '.json'
            if (os.path.exists(cosineSimFile)
                    and extraParams['simCacheLookup']):

                cosSim = getDictFromFile(cosineSimFile)
                if ('sim' in cosSim):

                    col['uris'][i]['sim'] = cosSim['sim']

                    if (cosSim['sim'] != -1):
                        validColSize += 1

                        if (PrecEval.isRel(cosSim['sim'],
                                           goldstandard['sim-coeff'])):
                            col['uris'][i]['relevant'] = True
                            precision += 1
                        else:
                            col['uris'][i]['relevant'] = False

                    continue
            #attempt getting sim from cache - end

            noopFlag = False
            usingSubText = ''
            if (len(col['uris'][i]['text']) < extraParams['minTextSize']):
                if ('post-details' in col['uris'][i]):
                    #gold standards do not have post-details
                    if ('substitute-text' in col['uris'][i]['post-details']):

                        subText = col['uris'][i]['post-details'][
                            'substitute-text'].strip()
                        if (subText != ''):
                            col['uris'][i]['text'] = subText
                            col['uris'][i]['custom'][
                                'substitute-text-active'] = True
                            usingSubText = '\n\t\tusing subtext: ' + col[
                                'uris'][i]['uri']
                        else:
                            noopFlag = True

                    else:
                        #don't process URIs with small text, but don't skip
                        #(continue) either, so the cache can still be updated
                        noopFlag = True

            matrix = [col['uris'][i]['text'], combinedGold]
            keywords = {'matrix': matrix, 'noopFlag': noopFlag}
            toPrint = '\tprlEvalCol():' + str(i) + ' of ' + str(
                colsize) + ' ' + extraParams['printSuffix'] + usingSubText

            if ('status-code' not in col['uris'][i]):
                print('\tproblem ahead for uri:', col['uris'][i]['uri'])
                print('\tproblem ahead for hash:', col['uris'][i]['hash'])
                print('\tproblem ahead for cosineSimFile:', cosineSimFile)
                print('\tproblem ahead for keys:', col['uris'][i].keys())

            cache = {
                'hash': col['uris'][i]['hash'],
                'self': cosineSimFile,
                'uri': col['uris'][i]['uri'],
                'title': col['uris'][i]['title'],
                'text': col['uris'][i]['text'],
                'text-len': len(col['uris'][i]['text']),
                'status-code': col['uris'][i]['status-code']
            }
            jobsLst.append({
                'func': PrecEval.calcPairSim,
                'args': keywords,
                'misc': {
                    'i': i,
                    'cache': cache
                },
                'print': toPrint
            })

        resLst = []
        if (len(jobsLst) != 0):
            resLst = parallelTask(jobsLst, threadCount=3)

        for res in resLst:

            indx = res['misc']['i']
            cache = res['misc']['cache']

            sim = res['output']
            col['uris'][indx]['sim'] = sim

            if (sim != -1):
                validColSize += 1

                if (PrecEval.isRel(sim, goldstandard['sim-coeff'])):
                    col['uris'][indx]['relevant'] = True
                    precision += 1
                else:
                    col['uris'][indx]['relevant'] = False

            #write cache - start
            cache['sim'] = sim
            dumpJsonToFile(cache['self'], cache)
            #write cache - end

        if (removeTxt):
            for i in range(colsize):
                if ('text' in col['uris'][i]):
                    del col['uris'][i]['text']

        if (validColSize > 0):
            return precision / validColSize
        else:
            return -1
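
    #Worked example of the precision arithmetic above (illustrative numbers):
    #with 5 URIs where one sim is -1 (excluded from validColSize) and 3 of the
    #remaining 4 are judged relevant, prlEvalCol returns 3 / 4 = 0.75.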

    def updateGoldstandard(self):
        self.goldstandard['timestamp'] = getNowTime()
        dumpJsonToFile(self.goldstandardFilename, self.goldstandard)