Python entityLibrary Examples

Programming Language: Python

Namespace/Package Name: entityLib

Method/Function: entityLibrary

Examples at hotexamples.com: 13

Python entityLibrary - 13 examples found. These are the top-rated real-world Python examples of entityLib.entityLibrary, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: getEntities.py Project: ctwiz/sourcereader

def getEntities(searchText, title, jsonOut=True, byID=False):
    ###################################################################
    ##func: getEntities
    ##param: searchText - body text; stored as the page outline
    ##       title      - page title; searched together with searchText
    ##       jsonOut    - True (default): return the hand-built JSON string
    ##                    False: return the raw entity list instead
    ##       byID       - True: the "id" field carries the entity id
    ##                    instead of the entity's lookupUrl
    ##desc: runs entityFinder and nicknameFinder over title + text, ranks
    ##      the results with setPrimo and serialises every entity whose
    ##      celeb_type is not 'hidden'
    ##ret: JSON array string, or '' when no visible entity was found;
    ##     infoModule.info.entityList itself when jsonOut is False
    ##################################################################
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title

    fullText = title + ' ' + searchText
    entities.entityFinder(fullText, True)
    entities.nicknameFinder(fullText, True, True)
    entities.setPrimo()

    entityList = infoModule.info.entityList
    if not jsonOut:
        # the original reached "return res" with res never assigned here,
        # which raised UnboundLocalError
        return entityList

    # hacky JSON building; python's json module outputs slightly different than what we need
    # NOTE(review): values are inserted unescaped, so a name containing a
    # double quote or backslash yields invalid JSON - confirm with consumers
    # before changing the wire format
    records = []
    for ids in entityList:
        celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
        if celeb_type == 'hidden':
            continue

        entityName = entityLib.entityLibrary(ids, 'entityName')
        linkPath = entityLib.entityLibrary(ids, 'linkPath')
        if byID:
            #swap URL for ID
            entityURL = str(ids)
        else:
            entityURL = entityLib.entityLibrary(ids, 'lookupUrl')

        record = ('{"id":"' + str(entityURL) +
                  '","name":"' + str(entityName) +
                  '","primo":"' + entityList[ids]['primo'] +
                  '","type":"' + str(celeb_type) + '",')
        if 'nameUsed' in entityList[ids]:
            #this means there's a nickname response
            record += '"nameUsed":"' + entityList[ids]['nameUsed'] + '",'
        record += ('"links":[{"from":"celebrifi.com","url":"' + str(linkPath) +
                   '","icon":"http://informifi.com/fi_icon.png"},' +
                   '{"from":"politifi.com","url":"' + str(linkPath) +
                   '","icon":"http://informifi.com/fi_icon.png"}]}')
        records.append(record)

    # nothing found, or nothing but hiddens
    if not records:
        return ''
    return '[' + ','.join(records) + ']'

Example #2

Show file

File: discoverEntities.py Project: dpgailey/sourcereader

def _teamRelevanceRows(entityId):
    ###################################################################
    ##func: _teamRelevanceRows
    ##param: entityId - celeb id used as cid_1 in the relation query
    ##desc: runs the celebs_related query (mptype_id = 75) for one entity
    ##      and collects every row it returns
    ##ret: list of (cid_2, relevance) int pairs; empty when the query
    ##     failed or matched nothing
    ##################################################################
    # str() because entity ids may be ints; the original concatenated them raw
    sql = 'SELECT cid_2, relevance FROM db_topics.celebs_related, db_topics.celebs WHERE celebs_related.cid_2=celebs.celeb_id and cid_1 = ' + str(entityId) + ' AND mptype_id = 75'
    relQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    if relQ is None or relQ is False:
        return []

    rows = []
    while True:
        row = relQ.fetch_row(1, 1)
        if not row:
            break
        rows.append((int(row[0]['cid_2']), int(row[0]['relevance'])))
    return rows


def getTeamFromCity():
    ###################################################################
    ##func: getTeamFromCity
    ##param: none (takes entities from infoModule.info.entityList)
    ##desc: determines missing team given city(s) and athlete(s), by
    ##      summing the relevance scores of story city(s)/all teams with
    ##      story athlete(s)/all teams.
    ##ret: (team cid, summed relevance) tuple for the best scoring team;
    ##     False when a team is already present, when no city or no
    ##     athlete was identified, or when no athlete has a team relation
    ##auth: mdk
    ##################################################################

    cityIds = []
    athleteIds = []

    for cid in infoModule.info.entityList:
        cidType = entityLib.entityLibrary(cid, 'celeb_type')
        if cidType == 'Team':
            # the story already names a team, nothing to infer
            return False
        if cidType == 'City':
            cityIds.append(cid)
        elif cidType == 'Athlete':
            athleteIds.append(cid)

    # exit if no cities or no athletes identified
    if not cityIds or not athleteIds:
        return False

    # team => relevance summed over every athlete; every row counts
    # (the original threw away the first row of each athlete query)
    teamScores = {}
    for athleteId in athleteIds:
        for teamId, relevance in _teamRelevanceRows(athleteId):
            teamScores[teamId] = teamScores.get(teamId, 0) + relevance

    # as before, city relevance alone is never enough to pick a team
    if not teamScores:
        return False

    # add every city's team relevance to get one master dict of
    # team => relevance (the original returned after the first city)
    for cityId in cityIds:
        # dict() keeps one value per team within a single city, last row wins
        for teamId, relevance in dict(_teamRelevanceRows(cityId)).items():
            teamScores[teamId] = teamScores.get(teamId, 0) + relevance

    bestTeam = max(teamScores.items(), key=operator.itemgetter(1))
    log.plog('picked team =>  ' + str(bestTeam), 2)
    return bestTeam

Example #3

Show file

File: addMicrodata.py Project: dpgailey/sourcereader

def _isPlainNamePart(part):
    ###################################################################
    ##func: _isPlainNamePart
    ##param: part - first or last name as returned by entityLibrary
    ##desc: decides whether a name part may be matched on its own
    ##ret: False for None, '' and anything int() accepts; True otherwise
    ##################################################################
    if part is None or part == '':
        return False
    try:
        int(part)
    except ValueError:
        return True
    # don't use first and last name if fname or lname is a number
    return False


def _microdataKind(entityId, organizations, hasName):
    ###################################################################
    ##func: _microdataKind
    ##param: entityId      - entity to classify
    ##       organizations - mptype ids that are marked up as affiliation
    ##       hasName       - whether the caller has a usable name to mark
    ##desc: picks the microdata markup for one entity; an organisation
    ##      mptype overrides the plain person markup
    ##ret: (opening itemscope span, itemprop value), or (None, None) when
    ##     the entity gets no markup
    ##################################################################
    itemtype = itemprop = None
    # Person microdata
    if hasName and entityLib.entityLibrary(entityId, 'human') == str(1):
        itemtype = 'Person'
        itemprop = 'name'
    # affiliation microdata
    if int(entityLib.entityLibrary(entityId, 'mptype_id')) in organizations:
        itemtype = 'Person'
        itemprop = 'affiliation'
    if itemtype is None:
        return None, None
    itemscope = '''<span itemscope itemtype="http://www.data-vocabulary.org/''' + itemtype + '''">'''
    return itemscope, itemprop


def _wrapTerm(text, term, itemscope, itemprop):
    ###################################################################
    ##func: _wrapTerm
    ##param: text      - text being marked up
    ##       term      - literal name to look for
    ##       itemscope - opening itemscope span
    ##       itemprop  - itemprop attribute value
    ##desc: wraps term in microdata spans where it occurs inside the text,
    ##      at the very start, and at the very end (three separate steps)
    ##ret: the updated text; unchanged when term is empty
    ##################################################################
    if not term:
        # an empty pattern would match at every word boundary
        return text

    # escaped so that names such as "A.J." or "Yahoo!" match literally
    pattern = re.escape(term)
    wrapped = (itemscope + '<span itemprop="' + itemprop + '">' + term +
               '</span></span>')

    step1 = re.search(r"([^\]=-])\b" + pattern + r"('*s*)\b([^\[]){1}",
                      text, re.I)
    if step1 is not None:
        text = text.replace(
            step1.group(0),
            step1.group(1) + wrapped + step1.group(2) + step1.group(3))

    # start-of-text match is case sensitive, as in the original
    step2 = re.search("^" + pattern + r"('*s*)\b([^\[]){1}", text)
    if step2 is not None:
        text = text.replace(step2.group(0),
                            wrapped + step2.group(1) + step2.group(2))

    step3 = re.search(r'\b' + pattern + '$', text, re.I)
    if step3 is not None:
        text = text.replace(step3.group(0), wrapped)
    return text


def _stashMicrodataBlocks(text, stash):
    ###################################################################
    ##func: _stashMicrodataBlocks
    ##param: text  - text possibly containing finished microdata spans
    ##       stash - list the spans are appended to (modified in place)
    ##desc: swaps every finished microdata span for ~#~N~#~, N being the
    ##      span's index in stash, so later passes cannot match inside it
    ##ret: the text with placeholders in place of the spans
    ##################################################################
    while True:
        block = re.search(r'<span itemscope.*?<\/span><\/span>', text)
        if block is None:
            break
        stash.append(block.group(0))
        text = text.replace(block.group(0),
                            "~#~" + str(len(stash) - 1) + "~#~")
    return text


def addMicrodata(text, entities, fullNames=False):
    ###################################################################
    ##func: addMicrodata
    ##param: text      - text (may contain html) to mark up
    ##       entities  - iterable of entity ids
    ##       fullNames - True: only match full entity names and nicknames
    ##                   False (default): also match first and last names
    ##desc: wraps matches of entity names, first/last names and nicknames
    ##      in data-vocabulary.org microdata spans.  Html tags are set
    ##      aside first so that no match is made inside them.
    ##ret: the marked up text; text unchanged when entities or text is
    ##     empty
    ##################################################################

    # mptypes by category
    organizations = [37, 43, 44, 48, 49, 66, 76, 79, 104, 110, 120]

    if len(entities) == 0:
        log.plog("no entities passed to addMicrodata", 5)
        return text

    if text is None or text == '':
        log.plog("no text passed to addMicrodata", 5)
        return text

    #celebList should be sorted from long to short
    entityNameArray = sorted(
        [[entity, len(entityLib.entityLibrary(entity, 'entityName'))]
         for entity in entities],
        key=lambda nameLen: nameLen[1],
        reverse=True)
    visibleIds = [
        entityTuple[0] for entityTuple in entityNameArray
        if entityLib.entityLibrary(entityTuple[0],
                                   'visibility') != 'invisible'
    ]

    htmlBlocks = []
    #set aside all html so that celeb matches are not made inside
    while True:
        reres = re.search('<.*?>', text)
        if reres is None:
            break
        text = text.replace(reres.group(0), "~*~%d~*~" % len(htmlBlocks))
        htmlBlocks.append(reres.group(0))

    # pass 1: full entity names
    for entityId in visibleIds:
        name = entityLib.entityLibrary(entityId, 'entityName').strip()
        itemscope, itemprop = _microdataKind(entityId, organizations,
                                             name != '')
        if itemscope is not None:
            text = _wrapTerm(text, name, itemscope, itemprop)

    # now check by first and last name
    #in between passes, alter text to have ~(num)~ where the celeb blocks are
    # the num points to the array containing the bit so that it can be rebuilt after nicknames are run.
    tmpArray = []
    if fullNames == False:
        #only do first and last name if the fullNames (meaning require full name) isn't set to true
        text = _stashMicrodataBlocks(text, tmpArray)

        for entityId in visibleIds:
            lname = entityLib.entityLibrary(entityId, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityId, 'fname')
            if fname is not None:
                fname = fname.strip()

            itemscope, itemprop = _microdataKind(
                entityId, organizations, lname != '' or fname != '')
            if itemscope is None:
                continue
            # first name before last name, as in the original; a None part
            # is skipped instead of crashing in int()
            for namePart in (fname, lname):
                if _isPlainNamePart(namePart):
                    text = _wrapTerm(text, namePart, itemscope, itemprop)

    #nicknames, after the rest are done
    #to prep for nicknames, alter text to have ~(num)~ where the celeb blocks are
    text = _stashMicrodataBlocks(text, tmpArray)

    for entityId in visibleIds:
        itemscope, itemprop = _microdataKind(entityId, organizations, True)
        if itemscope is None:
            continue
        nicknamesQ = mysql_tools.mysqlQuery(
            "select name, case_sensitive from db_topics.nicknames where cid_1="
            + str(entityId) + " or cid_2=" + str(entityId) + " or cid_3=" +
            str(entityId), infoModule.info.site['dblink'])
        if nicknamesQ is None or nicknamesQ is False:
            continue
        while True:
            nicknameRow = nicknamesQ.fetch_row(1, 1)
            if not nicknameRow:
                break
            nickPattern = r'\b' + re.escape(nicknameRow[0]['name']) + r'\b'
            # NOTE(review): case_sensitive == 1 selecting re.I looks
            # inverted; kept exactly as the original behaves - confirm the
            # column's meaning and type before flipping it
            if nicknameRow[0]['case_sensitive'] == 1:
                nicknameMatch = re.search(nickPattern, text, re.I)
            else:
                nicknameMatch = re.search(nickPattern, text)
            if nicknameMatch:
                text = text.replace(
                    nicknameMatch.group(0),
                    itemscope + '<span itemprop="' + itemprop + '">' +
                    nicknameMatch.group(0) + '</span></span>')

            #take it out and store it off to prevent more dupes
            text = _stashMicrodataBlocks(text, tmpArray)

    # put the stashed microdata spans back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock is None:
            break
        text = text.replace(repBlock.group(0),
                            tmpArray[int(repBlock.group(1))])

    # put the html that was set aside back
    for i, htmlBlock in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', htmlBlock)

    return text

Example #4

Show file

def setPrimo():
    ###################################################################
    ##func: setPrimo
    ##param: none
    ##desc: takes no params.  Instead, as long as page title, page text
    ##      and celebList are properly put together, it will rank the entities
    ##      for primo position: first the source's own entity, then
    ##      entities positioned in the title, then by frequency, and as
    ##      a last resort the first visible entity in the story
    ##ret: None
    ##auth: esr
    ##################################################################

    page = infoModule.info.page
    if 'outline' not in page or page['outline'] == '':
        log.plog('no outline set before setPrimo called', 5)
        return

    if 'title' not in page or page['title'] == '':
        log.plog('no title set before setPrimo called', 5)
        return

    entityList = infoModule.info.entityList
    if len(entityList) == 0:
        log.plog('no entities set before setPrimo called', 5)
        return

    #highlight text as a way to count the frequency
    highlightedText = highlightEntities.highlightEntitiesFromList(
        page['title'] + ' ' + page['outline'], entityList)
    while True:
        taggedEntity = re.search(r'\[celeb url=(.+?)\]', highlightedText)
        if taggedEntity is None:
            break
        # drop this one tag so the next search finds the following one
        highlightedText = highlightedText.replace(taggedEntity.group(0), '', 1)
        lookupUrl = taggedEntity.group(1)
        # NOTE(review): lookupUrl is concatenated into the statement
        # unescaped; switch to an escaping helper if mysql_tools has one
        sql = "select celeb_id from db_topics.celebs where lookupUrl='" + lookupUrl + "'"
        entityByLookupQ = mysql_tools.mysqlQuery(
            sql, infoModule.info.site['dblink'])
        if entityByLookupQ is None or entityByLookupQ is False:
            # failed query: skip this tag instead of crashing on fetch_row
            continue
        while True:
            entityByLookup = entityByLookupQ.fetch_row(1, 1)
            if not entityByLookup:
                break
            celebId = entityByLookup[0]['celeb_id']
            # ids that are not part of this story are ignored
            if celebId in entityList:
                entityList[celebId]['frequency'] += 1

    # now frequency is set, first two are based on position, next two based on frequency
    primoTypes = ['Y', '2', '3', '4', 'N']
    primoTypeCursor = 0

    #if source is associated with entity, it's auto-set to primo
    source = infoModule.info.source
    if ('celeb_id' in source and source['celeb_id'] != ''
            and int(source['celeb_id']) > 0
            and source['celeb_id'] in entityList):
        entityList[source['celeb_id']]['primo'] = primoTypes[primoTypeCursor]
        primoTypeCursor += 1

    # [entity id, position], earliest position first
    posArray = sorted(
        [[eKey, entityList[eKey]['position']] for eKey in entityList
         if entityList[eKey]['position'] is not None],
        key=lambda x: x[1])

    # [entity id, frequency] for entities seen more than once, most frequent first
    freqArray = sorted(
        [[eKey, entityList[eKey]['frequency']] for eKey in entityList
         if entityList[eKey]['frequency'] is not None
         and entityList[eKey]['frequency'] > 1],
        key=lambda x: x[1],
        reverse=True)

    titleLen = len(page['title'])
    #primo set by position in title
    for eKey, position in posArray:
        if primoTypeCursor >= 4 or position > titleLen:
            break
        ## only non-hidden entities can be primo
        if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s found in title, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    #primo set by frequency
    for eKey, frequency in freqArray:
        if primoTypeCursor >= 4:
            break
        if entityList[eKey]['primo'] != 'N':
            # already ranked by an earlier rule
            continue
        ## only non-hidden entities can be primo
        if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s has high frequency, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    #if nothing found, go to first position and set it to primo
    if primoTypeCursor == 0:
        for eKey, position in posArray:
            ## only non-hidden entities can be primo
            if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
                log.plog(
                    "entity %s found first in story, setting to primo %s" %
                    (eKey, primoTypes[primoTypeCursor]), 3)
                entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
                primoTypeCursor += 1
                break

Example #5

Show file

File: entityLib_ut.py Project: dpgailey/sourcereader

# Command line check for entityLib.entityLibrary: with a positive entity id
# as the first argument it prints that entity's name, lookupUrl and bio;
# otherwise it selects 20 random celeb ids from db_topics.celebs.
link = mysql_tools.mysqlConnect()
# mysqlConnect signals failure by returning False
if link == False :
    print "no connection"
    # NOTE(review): exits with status 0 even though the connection failed
    sys.exit(0)
    
# shared state read by the library code under test
infoModule.info.site['dblink'] = link	
infoModule.info.site['log_priority_threshold'] = 3	

if len(sys.argv) > 1 and int(sys.argv[1]) > 0:
    #get specific entity id (taken from the first command line argument)
    print "looking up entity_id: " + sys.argv[1]
    cid = sys.argv[1]

    print "%s:" % cid
    print "\t%s" % entityLib.entityLibrary(int(cid), 'entityName')
    print "\t%s" % entityLib.entityLibrary(int(cid), 'lookupUrl')
    print "\t%s" % entityLib.entityLibrary(int(cid), 'bio')


else:
    # no id given: pick 20 random entities instead
    sql = "select celeb_id from db_topics.celebs order by rand() limit 20"
    er = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    
    # mysqlQuery signals failure by returning False
    if er == False:
        print "select random entities failed"
        sys.exit(0)
    
    
    # NOTE(review): the listing is cut off here - the loop's exit condition
    # and the per-row output are not visible
    while (1):
        row=er.fetch_row(1,1)

Example #6

Show file

File: entities.py Project: ctwiz/sourcereader

def setPrimo():
    ###################################################################
    ##func: setPrimo
    ##param: none
    ##desc: takes no params.  Instead, as long as page title, page text
    ##      and celebList are properly put together, it will rank the entities
    ##      for primo position: first the source's own entity, then
    ##      entities positioned in the title, then by frequency, and as
    ##      a last resort the first visible entity in the story
    ##ret: None
    ##auth: esr
    ##################################################################

    page = infoModule.info.page
    if 'outline' not in page or page['outline'] == '':
        log.plog('no outline set before setPrimo called', 5)
        return

    if 'title' not in page or page['title'] == '':
        log.plog('no title set before setPrimo called', 5)
        return

    entityList = infoModule.info.entityList
    if len(entityList) == 0:
        log.plog('no entities set before setPrimo called', 5)
        return

    #highlight text as a way to count the frequency
    highlightedText = highlightEntities.highlightEntitiesFromList(page['title'] + ' ' + page['outline'], entityList)
    while True:
        taggedEntity = re.search(r'\[celeb url=(.+?)\]', highlightedText)
        if taggedEntity is None:
            break
        # drop this one tag so the next search finds the following one
        highlightedText = highlightedText.replace(taggedEntity.group(0), '', 1)
        lookupUrl = taggedEntity.group(1)
        # NOTE(review): lookupUrl is concatenated into the statement
        # unescaped; switch to an escaping helper if mysql_tools has one
        sql = "select celeb_id from db_topics.celebs where lookupUrl='" + lookupUrl + "'"
        entityByLookupQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        if entityByLookupQ is None or entityByLookupQ is False:
            # failed query: skip this tag instead of crashing on fetch_row
            continue
        while True:
            entityByLookup = entityByLookupQ.fetch_row(1, 1)
            if not entityByLookup:
                break
            celebId = entityByLookup[0]['celeb_id']
            # ids that are not part of this story are ignored
            if celebId in entityList:
                entityList[celebId]['frequency'] += 1

    # now frequency is set, first two are based on position, next two based on frequency
    primoTypes = ['Y', '2', '3', '4', 'N']
    primoTypeCursor = 0

    #if source is associated with entity, it's auto-set to primo
    source = infoModule.info.source
    sourceHasEntity = 'celeb_id' in source and source['celeb_id'] != '' and int(source['celeb_id']) > 0
    if sourceHasEntity and source['celeb_id'] in entityList:
        entityList[source['celeb_id']]['primo'] = primoTypes[primoTypeCursor]
        primoTypeCursor += 1

    # [entity id, position], earliest position first
    posArray = [[eKey, entityList[eKey]['position']] for eKey in entityList if entityList[eKey]['position'] is not None]
    posArray.sort(key=lambda x: x[1])

    # [entity id, frequency] for entities seen more than once, most frequent first
    freqArray = [[eKey, entityList[eKey]['frequency']] for eKey in entityList if entityList[eKey]['frequency'] is not None and entityList[eKey]['frequency'] > 1]
    freqArray.sort(key=lambda x: x[1], reverse=True)

    titleLen = len(page['title'])
    #primo set by position in title
    for eKey, position in posArray:
        if primoTypeCursor >= 4 or position > titleLen:
            break
        ## only non-hidden entities can be primo
        if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
            log.plog("entity %s found in title, setting to primo %s" % (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    #primo set by frequency
    for eKey, frequency in freqArray:
        if primoTypeCursor >= 4:
            break
        if entityList[eKey]['primo'] != 'N':
            # already ranked by an earlier rule
            continue
        ## only non-hidden entities can be primo
        if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
            log.plog("entity %s has high frequency, setting to primo %s" % (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    #if nothing found, go to first position and set it to primo
    if primoTypeCursor == 0:
        for eKey, position in posArray:
            ## only non-hidden entities can be primo
            if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
                log.plog("entity %s found first in story, setting to primo %s" % (eKey, primoTypes[primoTypeCursor]), 3)
                entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
                primoTypeCursor += 1
                break

Example #7

Show file

def _isNumericName(value):
    # True when int() accepts the value. None (TypeError) and non-numeric
    # text (ValueError) both count as "not numeric" instead of crashing.
    try:
        int(value)
    except (TypeError, ValueError):
        return False
    return True


def _celebTag(lookupUrl, label):
    # markup for one highlighted mention: [celeb url=<lookupUrl>]label[/celeb]
    return "[celeb url=" + lookupUrl + "]" + label + "[/celeb]"


def _stashCelebBlocks(text, stash):
    # move every [celeb ...][/celeb] block out of text into stash, leaving a
    # ~#~<index>~#~ placeholder so later passes cannot match inside a block
    while True:
        squareBlocks = re.search(r'\[celeb url.*?\[\/celeb\]', text)
        if squareBlocks is None:
            break
        placeholder = "~#~" + str(len(stash)) + "~#~"
        stash.append(squareBlocks.group(0))
        text = text.replace(squareBlocks.group(0), placeholder)
    return text


def _wrapCelebMatches(text, term, lookupUrl):
    # wrap occurrences of term in celeb markup and return the new text.
    # NOTE(review): term goes into the pattern unescaped, so regex
    # metacharacters in a name are interpreted as regex syntax.

    # mid-text: preceded by a char other than ']', '=' or '-' and followed
    # (after an optional 's / s suffix) by a char other than '['
    step1 = re.search(r"([^\]=-])\b" + term + r"('*s*)\b([^\[]){1}", text,
                      re.I)
    if step1 is not None:
        text = text.replace(
            step1.group(0),
            step1.group(1) + _celebTag(lookupUrl, term) + step1.group(2) +
            step1.group(3))

    # start of text (this pass is case sensitive)
    step2 = re.search("^" + term + r"('*s*)\b([^\[]){1}", text)
    if step2 is not None:
        text = text.replace(
            step2.group(0),
            _celebTag(lookupUrl, term) + step2.group(1) + step2.group(2))

    # end of text
    step3 = re.search(r'\b' + term + '$', text, re.I)
    if step3 is not None:
        text = text.replace(step3.group(0), _celebTag(lookupUrl, term))

    return text


def highlightEntitiesFromList(text, entities, fullNames=False):
    ###################################################################
    ##func: highlightEntitiesFromList
    ##param: text      - string to mark up (may contain html tags)
    ##       entities  - list of entity ids
    ##       fullNames - when False (default) first and last names are
    ##                   matched on their own as well as the full name
    ##desc: takes a list of entities (id numbers) and finds all name matches
    ##      and nickname matches and wraps those in the text with
    ##      [celeb url=celeb-url][/celeb]
    ##      html tags are set aside first so no match is made inside a tag
    ##      in the source_reader context, it will probably be phased out
    ##      its current usage is to mark up the outline of a page and to
    ##      be a tool for determining the frequency with which entities appear
    ##ret: string (text is returned untouched when text or entities is empty)
    ##auth: esr
    ##################################################################

    if len(entities) == 0:
        log.plog("no entities passed to highlightEntitiesFromList", 5)
        return text

    if text is None or text == '':
        log.plog("no text passed to highlightEntitiesFromList", 5)
        return text

    #celebList should be sorted from long to short
    entityNameArray = [
        [entity, len(entityLib.entityLibrary(entity, 'entityName'))]
        for entity in entities
    ]
    entityNameArray.sort(key=lambda nameLen: nameLen[1], reverse=True)

    htmlBlocks = []
    #set aside all html so that celeb matches are not made inside
    ctr = 0
    while True:
        reres = re.search('<.*?>', text)
        if reres is None:
            break
        htmlBlocks.append(reres.group(0))
        text = text.replace(reres.group(0), "~*~%d~*~" % ctr)
        ctr += 1

    # pass 1: full entity names
    for entityTuple in entityNameArray:
        entityId = entityTuple[0]
        if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
            continue
        name = entityLib.entityLibrary(entityId, 'entityName').strip()
        if name == '':
            continue
        lookupUrl = entityLib.entityLibrary(entityId, 'lookupUrl')
        text = _wrapCelebMatches(text, name, lookupUrl)

    # pass 2: first and last name on their own
    # tmpArray holds the stashed celeb blocks; the number inside each
    # ~#~(num)~#~ placeholder is the block's index in tmpArray so the text
    # can be rebuilt after nicknames are run.
    tmpArray = []
    if fullNames == False:
        #only do first and last name if the fullNames (meaning require full name) isn't set to true
        text = _stashCelebBlocks(text, tmpArray)

        for entityTuple in entityNameArray:
            entityId = entityTuple[0]
            if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
                continue
            lname = entityLib.entityLibrary(entityId, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityId, 'fname')
            if fname is not None:
                fname = fname.strip()
            lookupUrl = entityLib.entityLibrary(entityId, 'lookupUrl')
            # don't use first and last name if fname or lname is a number
            # (first name is handled before last name, as before)
            for namePart in (fname, lname):
                if namePart is None or namePart == '':
                    continue
                if _isNumericName(namePart):
                    continue
                text = _wrapCelebMatches(text, namePart, lookupUrl)

    # pass 3: nicknames, after the rest are done
    #to prep for nicknames, stash the celeb blocks found so far
    text = _stashCelebBlocks(text, tmpArray)

    for entityTuple in entityNameArray:
        entityId = entityTuple[0]
        if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
            continue
        lookupUrl = entityLib.entityLibrary(entityId, 'lookupUrl')
        nicknamesQ = mysql_tools.mysqlQuery(
            "select name, case_sensitive from db_topics.nicknames where cid_1="
            + str(entityId) + " or cid_2=" + str(entityId) + " or cid_3=" +
            str(entityId), infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1, 1)
            if nicknameRow == ():
                break
            # NOTE(review): rows flagged case_sensitive == 1 are searched
            # with re.I (ignore case) and all others case sensitively; this
            # looks inverted - confirm against the nicknames table before
            # changing it.
            if nicknameRow[0]['case_sensitive'] == 1:
                nicknameMatch = re.search(
                    r'\b' + nicknameRow[0]['name'] + r'\b', text, re.I)
            else:
                nicknameMatch = re.search(
                    r'\b' + nicknameRow[0]['name'] + r'\b', text)
            if nicknameMatch is not None:
                text = text.replace(
                    nicknameMatch.group(0),
                    _celebTag(lookupUrl, nicknameMatch.group(0)))

            #take it out and store it off to prevent more dupes
            text = _stashCelebBlocks(text, tmpArray)

    # put the stashed celeb blocks back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock is None:
            break
        text = text.replace(repBlock.group(0),
                            tmpArray[int(repBlock.group(1))])

    # pull a trailing possessive / plural suffix inside the celeb block
    text = text.replace("[/celeb]'s", "'s[/celeb]")
    text = text.replace("[/celeb]s", "s[/celeb]")

    # put the html tags back
    for i, htmlBlock in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', htmlBlock)

    return text

Example #8

Show file

File: discoverEntities_ut.py Project: dpgailey/sourcereader


def getIds(sub_id):
    ###################################################################
    ##func: getIds
    ##param: sub_id (id used to filter the subs_celebs table)
    ##desc: looks up the celeb_id of every subs_celebs row with the given
    ##      sub_id in the current site database and registers each one in
    ##      infoModule.info.entityList with a fresh default record
    ##ret: none (infoModule.info.entityList is updated in place)
    ##################################################################
    # NOTE(review): sub_id is concatenated straight into the SQL text;
    # only pass trusted, numeric ids.
    sql = 'SELECT celeb_id FROM ' + infoModule.info.site[
        'database'] + '.subs_celebs WHERE sub_id = ' + str(sub_id)
    entityIdsQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    # each fetched row is indexed by column name below
    entityIds = entityIdsQ.fetch_row(0, 1)

    for row in entityIds:
        infoModule.info.entityList[row['celeb_id']] = {
            'position': None,
            'frequency': 0,
            'primo': 'N'
        }


# ad-hoc driver: load the entities of one story, drop its team and print
# what getRelevantEntity() comes up with
if True:
    # http://dev.sportifi.com/news/Carmelo-about-to-put-fans-in-an-awkward-position-52139.html
    getIds(52139)

    # delete team for testing purposes
    del infoModule.info.entityList['853957']

    # list the remaining entity ids with their celeb_type
    for cid in infoModule.info.entityList.keys():
        cidType = entityLib.entityLibrary(cid, 'celeb_type')
        print(str(cid) + " - " + cidType)

    result = getRelevantEntity()
    print(result)

Example #9

Show file

File: getEntities.py Project: dpgailey/sourcereader

def getEntities(searchText, title, jsonOut=True, byID=False):
    ###################################################################
    ##func: getEntities
    ##param: searchText - body text, stored as the page outline
    ##       title      - page title
    ##       jsonOut    - when True, build the JSON-like string described
    ##                    below
    ##       byID       - when True, the "id" field holds the entity id
    ##                    instead of its lookupUrl
    ##desc: stores title and text in infoModule.info.page, runs the entity
    ##      finder, nickname finder and setPrimo over them, then renders
    ##      every entity in infoModule.info.entityList whose celeb_type is
    ##      not 'hidden' as one object of a JSON-like array
    ##ret: string; '' when there is nothing to report
    ##################################################################
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title

    entities.entityFinder(title + ' ' + searchText, True)
    entities.nicknameFinder(title + ' ' + searchText, True, True)
    entities.setPrimo()

    entityList = infoModule.info.entityList

    # NOTE(review): with jsonOut false the old code hit `return res` with
    # res unbound (UnboundLocalError). It now returns '' like the other
    # "nothing to report" paths - confirm that is the wanted non-JSON result.
    res = ''
    if jsonOut == True:
        # hacky JSON building; python's json module outputs slightly different than what we need
        # NOTE(review): values are inserted without any JSON escaping.
        fragments = []
        for ids in entityList.keys():
            entityName = entityLib.entityLibrary(ids, 'entityName')
            celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
            linkPath = entityLib.entityLibrary(ids, 'linkPath')
            entityURL = entityLib.entityLibrary(ids, 'lookupUrl')
            if byID:
                #swap URL for ID
                entityURL = str(ids)

            if celeb_type == 'hidden':
                continue

            fragment = ('{"id":"' + str(entityURL) +
                        '","name":"' + str(entityName) +
                        '","primo":"' + entityList[ids]['primo'] +
                        '","type":"' + str(celeb_type) + '",')
            if 'nameUsed' in entityList[ids]:
                #this means there's a nickname response
                fragment += '"nameUsed":"' + entityList[ids]['nameUsed'] + '",'
            fragment += (
                '"links":[{"from":"celebrifi.com","url":"' + str(linkPath) +
                '","icon":"http://informifi.com/fi_icon.png"},'
                '{"from":"politifi.com","url":"' + str(linkPath) +
                '","icon":"http://informifi.com/fi_icon.png"}]}')
            fragments.append(fragment)

        #catch for nothing but hiddens (or no entities at all): res stays ''
        if fragments:
            res = '[' + ','.join(fragments) + ']'
    return res

Example #10

Show file

File: addMicrodata.py Project: ctwiz/sourcereader

def _looksNumeric(value):
    # True when int() accepts the value. None (TypeError) and non-numeric
    # text (ValueError) both count as "not numeric" instead of crashing.
    try:
        int(value)
    except (TypeError, ValueError):
        return False
    return True


def _microdataProp(entityId, organizations, hasName):
    # pick the itemprop for an entity: 'name' for humans that have a usable
    # name, 'affiliation' when mptype_id is an organization type (this wins
    # over 'name'), None when neither applies
    itemprop = None
    if hasName and entityLib.entityLibrary(entityId, 'human') == str(1):
        itemprop = 'name'
    if int(entityLib.entityLibrary(entityId, 'mptype_id')) in organizations:
        itemprop = 'affiliation'
    return itemprop


def _microdataSpan(itemtype, itemprop, label):
    # nested spans marking label as itemprop of a data-vocabulary.org itemtype
    itemscope = '''<span itemscope itemtype="http://www.data-vocabulary.org/''' + itemtype + '''">'''
    return itemscope + '<span itemprop="' + itemprop + '">' + label + '</span></span>'


def _stashMicrodataBlocks(text, stash):
    # move every microdata span out of text into stash, leaving a
    # ~#~<index>~#~ placeholder so later passes cannot match inside a span
    while True:
        squareBlocks = re.search(r'<span itemscope.*?<\/span><\/span>', text)
        if squareBlocks is None:
            break
        placeholder = "~#~" + str(len(stash)) + "~#~"
        stash.append(squareBlocks.group(0))
        text = text.replace(squareBlocks.group(0), placeholder)
    return text


def _wrapMicrodataMatches(text, term, itemtype, itemprop):
    # wrap occurrences of term in microdata spans and return the new text.
    # NOTE(review): term goes into the pattern unescaped, so regex
    # metacharacters in a name are interpreted as regex syntax.
    span = _microdataSpan(itemtype, itemprop, term)

    # mid-text: preceded by a char other than ']', '=' or '-' and followed
    # (after an optional 's / s suffix) by a char other than '['
    step1 = re.search(r"([^\]=-])\b" + term + r"('*s*)\b([^\[]){1}", text, re.I)
    if step1 is not None:
        text = text.replace(step1.group(0), step1.group(1) + span + step1.group(2) + step1.group(3))

    # start of text (this pass is case sensitive)
    step2 = re.search("^" + term + r"('*s*)\b([^\[]){1}", text)
    if step2 is not None:
        text = text.replace(step2.group(0), span + step2.group(1) + step2.group(2))

    # end of text
    step3 = re.search(r'\b' + term + '$', text, re.I)
    if step3 is not None:
        text = text.replace(step3.group(0), span)

    return text


def addMicrodata(text, entities, fullNames = False):
    ###################################################################
    ##func: addMicrodata
    ##param: text      - string to mark up (may contain html tags)
    ##       entities  - list of entity ids
    ##       fullNames - when False (default) first and last names are
    ##                   matched on their own as well as the full name
    ##desc: finds name and nickname matches for the given entities and wraps
    ##      them in data-vocabulary.org microdata spans; humans are marked
    ##      as itemprop "name", entities whose mptype_id is in the
    ##      organizations list as itemprop "affiliation", anything else is
    ##      left alone. html tags are set aside first so no match is made
    ##      inside a tag
    ##ret: string (text is returned untouched when text or entities is empty)
    ##################################################################

    # mptypes by category
    organizations = [37,43,44,48,49,66,76,79,104,110,120]
    # every span emitted here uses the Person itemtype
    itemtype = 'Person'

    if len(entities) == 0:
        log.plog("no entities passed to addMicrodata", 5)
        return text

    if text is None or text == '':
        log.plog("no text passed to addMicrodata", 5)
        return text

    #celebList should be sorted from long to short
    entityNameArray = [
        [entity, len(entityLib.entityLibrary(entity, 'entityName'))]
        for entity in entities
    ]
    entityNameArray.sort(key=lambda nameLen: nameLen[1], reverse=True)

    htmlBlocks = []
    #set aside all html so that celeb matches are not made inside
    ctr = 0
    while True:
        reres = re.search('<.*?>', text)
        if reres is None:
            break
        htmlBlocks.append(reres.group(0))
        text = text.replace(reres.group(0), "~*~%d~*~" % ctr)
        ctr += 1

    # pass 1: full entity names
    for entityTuple in entityNameArray:
        entityId = entityTuple[0]
        if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
            continue
        name = entityLib.entityLibrary(entityId, 'entityName').strip()
        # an empty name would make the patterns match empty strings and
        # text.replace('', ...) would then insert a span between every
        # character, so skip it whatever the entity type is
        if name == '':
            continue
        itemprop = _microdataProp(entityId, organizations, True)
        if itemprop is not None:
            text = _wrapMicrodataMatches(text, name, itemtype, itemprop)

    # pass 2: first and last name on their own
    # tmpArray holds the stashed spans; the number inside each
    # ~#~(num)~#~ placeholder is the span's index in tmpArray so the text
    # can be rebuilt after nicknames are run.
    tmpArray = []
    if fullNames == False:
        #only do first and last name if the fullNames (meaning require full name) isn't set to true
        text = _stashMicrodataBlocks(text, tmpArray)

        for entityTuple in entityNameArray:
            entityId = entityTuple[0]
            if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
                continue
            lname = entityLib.entityLibrary(entityId, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityId, 'fname')
            if fname is not None:
                fname = fname.strip()

            # as before, a None name part counts as "present" for the
            # person check; it is filtered out again just below
            hasName = lname != '' or fname != ''
            itemprop = _microdataProp(entityId, organizations, hasName)
            if itemprop is None:
                continue
            # don't use first and last name if fname or lname is a number
            # (first name is handled before last name, as before)
            for namePart in (fname, lname):
                if namePart is None or namePart == '':
                    continue
                if _looksNumeric(namePart):
                    continue
                text = _wrapMicrodataMatches(text, namePart, itemtype, itemprop)

    # pass 3: nicknames, after the rest are done
    #to prep for nicknames, stash the spans found so far
    text = _stashMicrodataBlocks(text, tmpArray)

    for entityTuple in entityNameArray:
        entityId = entityTuple[0]
        if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
            continue
        itemprop = _microdataProp(entityId, organizations, True)
        if itemprop is None:
            continue
        nicknamesQ = mysql_tools.mysqlQuery("select name, case_sensitive from db_topics.nicknames where cid_1=" + str(entityId) + " or cid_2=" + str(entityId) + " or cid_3=" + str(entityId) , infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1,1)
            if nicknameRow == ():
                break
            # NOTE(review): rows flagged case_sensitive == 1 are searched
            # with re.I (ignore case) and all others case sensitively; this
            # looks inverted - confirm against the nicknames table before
            # changing it.
            if nicknameRow[0]['case_sensitive'] == 1:
                nicknameMatch = re.search(r'\b' + nicknameRow[0]['name'] + r'\b', text, re.I)
            else:
                nicknameMatch = re.search(r'\b' + nicknameRow[0]['name'] + r'\b', text)
            if nicknameMatch is not None:
                text = text.replace(nicknameMatch.group(0), _microdataSpan(itemtype, itemprop, nicknameMatch.group(0)))

            #take it out and store it off to prevent more dupes
            text = _stashMicrodataBlocks(text, tmpArray)

    # put the stashed spans back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock is None:
            break
        text = text.replace(repBlock.group(0), tmpArray[int(repBlock.group(1))])

    # put the html tags back
    for i, htmlBlock in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', htmlBlock)

    return text

Example #11

Show file

File: discoverEntities_ut.py Project: ctwiz/sourcereader

	sys.exit(0)
	
# set globals for site: the database name and db link stored here are what
# getIds() below reads when it builds and runs its query
# NOTE(review): `link` is not defined in this excerpt - presumably the open
# db connection created earlier in the file; confirm.
infoModule.info.site['database'] = 'db_sportifi'
infoModule.info.site['dblink']   = link

def getIds(sub_id):
    ###################################################################
    ##func: getIds
    ##param: sub_id (id used to filter the subs_celebs table)
    ##desc: looks up the celeb_id of every subs_celebs row with the given
    ##      sub_id in the current site database and registers each one in
    ##      infoModule.info.entityList with a fresh default record
    ##ret: none (infoModule.info.entityList is updated in place)
    ##################################################################
    # NOTE(review): sub_id is concatenated straight into the SQL text;
    # only pass trusted, numeric ids.
    sql = 'SELECT celeb_id FROM ' + infoModule.info.site['database'] + '.subs_celebs WHERE sub_id = ' + str(sub_id)
    entityIdsQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    # each fetched row is indexed by column name below
    entityIds = entityIdsQ.fetch_row(0,1)

    for row in entityIds:
        infoModule.info.entityList[row['celeb_id']] = {'position': None, 'frequency': 0, 'primo' : 'N'}

# ad-hoc driver: load the entities of one story, drop its team and print
# what getRelevantEntity() comes up with
if True:
    # http://dev.sportifi.com/news/Carmelo-about-to-put-fans-in-an-awkward-position-52139.html
    getIds(52139)

    # delete team for testing purposes
    del infoModule.info.entityList['853957']

    # list the remaining entity ids with their celeb_type
    for cid in infoModule.info.entityList.keys():
        cidType = entityLib.entityLibrary(cid,'celeb_type')
        print(str(cid) + " - " + cidType)

    result = getRelevantEntity()
    print(result)

Example #12

Show file

File: highlightEntities.py Project: ctwiz/sourcereader

def _isNumericName(value):
    # True when int() accepts the value. None (TypeError) and non-numeric
    # text (ValueError) both count as "not numeric" instead of crashing.
    try:
        int(value)
    except (TypeError, ValueError):
        return False
    return True


def _celebTag(lookupUrl, label):
    # markup for one highlighted mention: [celeb url=<lookupUrl>]label[/celeb]
    return "[celeb url=" + lookupUrl + "]" + label + "[/celeb]"


def _stashCelebBlocks(text, stash):
    # move every [celeb ...][/celeb] block out of text into stash, leaving a
    # ~#~<index>~#~ placeholder so later passes cannot match inside a block
    while True:
        squareBlocks = re.search(r'\[celeb url.*?\[\/celeb\]', text)
        if squareBlocks is None:
            break
        placeholder = "~#~" + str(len(stash)) + "~#~"
        stash.append(squareBlocks.group(0))
        text = text.replace(squareBlocks.group(0), placeholder)
    return text


def _wrapCelebMatches(text, term, lookupUrl):
    # wrap occurrences of term in celeb markup and return the new text.
    # NOTE(review): term goes into the pattern unescaped, so regex
    # metacharacters in a name are interpreted as regex syntax.

    # mid-text: preceded by a char other than ']', '=' or '-' and followed
    # (after an optional 's / s suffix) by a char other than '['
    step1 = re.search(r"([^\]=-])\b" + term + r"('*s*)\b([^\[]){1}", text, re.I)
    if step1 is not None:
        text = text.replace(step1.group(0), step1.group(1) + _celebTag(lookupUrl, term) + step1.group(2) + step1.group(3))

    # start of text (this pass is case sensitive)
    step2 = re.search("^" + term + r"('*s*)\b([^\[]){1}", text)
    if step2 is not None:
        text = text.replace(step2.group(0), _celebTag(lookupUrl, term) + step2.group(1) + step2.group(2))

    # end of text
    step3 = re.search(r'\b' + term + '$', text, re.I)
    if step3 is not None:
        text = text.replace(step3.group(0), _celebTag(lookupUrl, term))

    return text


def highlightEntitiesFromList(text, entities, fullNames = False):
    ###################################################################
    ##func: highlightEntitiesFromList
    ##param: text      - string to mark up (may contain html tags)
    ##       entities  - list of entity ids
    ##       fullNames - when False (default) first and last names are
    ##                   matched on their own as well as the full name
    ##desc: takes a list of entities (id numbers) and finds all name matches
    ##      and nickname matches and wraps those in the text with
    ##      [celeb url=celeb-url][/celeb]
    ##      html tags are set aside first so no match is made inside a tag
    ##      in the source_reader context, it will probably be phased out
    ##      its current usage is to mark up the outline of a page and to
    ##      be a tool for determining the frequency with which entities appear
    ##ret: string (text is returned untouched when text or entities is empty)
    ##auth: esr
    ##################################################################

    if len(entities) == 0:
        log.plog("no entities passed to highlightEntitiesFromList", 5)
        return text

    if text is None or text == '':
        log.plog("no text passed to highlightEntitiesFromList", 5)
        return text

    #celebList should be sorted from long to short
    entityNameArray = [
        [entity, len(entityLib.entityLibrary(entity, 'entityName'))]
        for entity in entities
    ]
    entityNameArray.sort(key=lambda nameLen: nameLen[1], reverse=True)

    htmlBlocks = []
    #set aside all html so that celeb matches are not made inside
    ctr = 0
    while True:
        reres = re.search('<.*?>', text)
        if reres is None:
            break
        htmlBlocks.append(reres.group(0))
        text = text.replace(reres.group(0), "~*~%d~*~" % ctr)
        ctr += 1

    # pass 1: full entity names
    for entityTuple in entityNameArray:
        entityId = entityTuple[0]
        if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
            continue
        name = entityLib.entityLibrary(entityId, 'entityName').strip()
        if name == '':
            continue
        lookupUrl = entityLib.entityLibrary(entityId, 'lookupUrl')
        text = _wrapCelebMatches(text, name, lookupUrl)

    # pass 2: first and last name on their own
    # tmpArray holds the stashed celeb blocks; the number inside each
    # ~#~(num)~#~ placeholder is the block's index in tmpArray so the text
    # can be rebuilt after nicknames are run.
    tmpArray = []
    if fullNames == False:
        #only do first and last name if the fullNames (meaning require full name) isn't set to true
        text = _stashCelebBlocks(text, tmpArray)

        for entityTuple in entityNameArray:
            entityId = entityTuple[0]
            if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
                continue
            lname = entityLib.entityLibrary(entityId, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityId, 'fname')
            if fname is not None:
                fname = fname.strip()
            lookupUrl = entityLib.entityLibrary(entityId, 'lookupUrl')
            # don't use first and last name if fname or lname is a number
            # (first name is handled before last name, as before)
            for namePart in (fname, lname):
                if namePart is None or namePart == '':
                    continue
                if _isNumericName(namePart):
                    continue
                text = _wrapCelebMatches(text, namePart, lookupUrl)

    # pass 3: nicknames, after the rest are done
    #to prep for nicknames, stash the celeb blocks found so far
    text = _stashCelebBlocks(text, tmpArray)

    for entityTuple in entityNameArray:
        entityId = entityTuple[0]
        if entityLib.entityLibrary(entityId, 'visibility') == 'invisible':
            continue
        lookupUrl = entityLib.entityLibrary(entityId, 'lookupUrl')
        nicknamesQ = mysql_tools.mysqlQuery("select name, case_sensitive from db_topics.nicknames where cid_1=" + str(entityId) + " or cid_2=" + str(entityId) + " or cid_3=" + str(entityId) , infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1,1)
            if nicknameRow == ():
                break
            # NOTE(review): rows flagged case_sensitive == 1 are searched
            # with re.I (ignore case) and all others case sensitively; this
            # looks inverted - confirm against the nicknames table before
            # changing it.
            if nicknameRow[0]['case_sensitive'] == 1:
                nicknameMatch = re.search(r'\b' + nicknameRow[0]['name'] + r'\b', text, re.I)
            else:
                nicknameMatch = re.search(r'\b' + nicknameRow[0]['name'] + r'\b', text)
            if nicknameMatch is not None:
                text = text.replace(nicknameMatch.group(0), _celebTag(lookupUrl, nicknameMatch.group(0)))

            #take it out and store it off to prevent more dupes
            text = _stashCelebBlocks(text, tmpArray)

    # put the stashed celeb blocks back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock is None:
            break
        text = text.replace(repBlock.group(0), tmpArray[int(repBlock.group(1))])

    # pull a trailing possessive / plural suffix inside the celeb block
    text = text.replace("[/celeb]'s", "'s[/celeb]")
    text = text.replace("[/celeb]s", "s[/celeb]")

    # put the html tags back
    for i, htmlBlock in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', htmlBlock)

    return text

Example #13

Show file

File: cityToTeam.py Project: ctwiz/sourcereader

def _teamRelevanceRows(cid):
    """Return a list of (related cid, relevance) int pairs for entity *cid*.

    Runs the celebs_related/celebs join for cid_1 = *cid* and reads the
    result one row at a time until fetch_row() comes back empty.
    """
    # NOTE(review): mptype_id = 75 presumably restricts the related entities
    # to teams (the caller builds "team => relevance" maps) -- confirm
    # against the db_topics schema.
    # str() keeps the concatenation valid whether cid is an int or a string.
    sql = (
        "SELECT cid_2, relevance FROM db_topics.celebs_related, db_topics.celebs WHERE celebs_related.cid_2=celebs.celeb_id and cid_1 = "
        + str(cid)
        + " AND mptype_id = 75"
    )
    relQ = mysql_tools.mysqlQuery(sql, infoModule.info.site["dblink"])

    pairs = []
    while True:
        # One row per call, addressed by column name.
        row = relQ.fetch_row(1, 1)
        if not row:
            break
        pairs.append((int(row[0]["cid_2"]), int(row[0]["relevance"])))
    return pairs


def getRelevantEntity():
    """Determine the missing team of a story from its city(s) and athlete(s).

    Takes no parameters; the entities are read from
    infoModule.info.entityList. For every "City" and every "Athlete" entity
    the related-team relevance scores are looked up, and the scores of all
    cities and all athletes are summed per team.

    Returns:
        False if the story already contains a "Team" entity, or contains no
        "City" entity, or contains no "Athlete" entity.
        Otherwise the (team cid, summed relevance) tuple with the highest
        summed relevance. NOTE: the original header described the result as
        "cid of team", but the code has always returned the whole pair; that
        shape is kept so existing callers are unaffected.
        Falls through (implicit None) when no athlete has any related team.

    Author: mdk
    """
    # Build lists of city and athlete entities.
    cityIds = []
    athleteIds = []

    for cid in infoModule.info.entityList.keys():
        cidType = entityLib.entityLibrary(cid, "celeb_type")
        if cidType == "Team":
            # A team is already present, so there is nothing to infer.
            return False
        if cidType == "City":
            cityIds.append(cid)
        if cidType == "Athlete":
            athleteIds.append(cid)

    if not cityIds:
        return False

    if not athleteIds:
        return False

    log.plog("City cids: ")
    for cityId in cityIds:
        log.plog(cityId, 2)
    log.plog("Athlete cids: ")
    for athleteId in athleteIds:
        log.plog(athleteId, 2)

    # One team => relevance dict per city. Within a single city a repeated
    # team keeps the last relevance seen (same as the original assignment).
    cityTeamScores = [dict(_teamRelevanceRows(cityId)) for cityId in cityIds]

    # team => relevance summed over all athletes. Every returned row is
    # counted; the original discarded the first row of each athlete's result
    # through an extra fetch_row() call placed before the read loop.
    teamRelevance = {}
    for athleteId in athleteIds:
        for teamCid, relevance in _teamRelevanceRows(athleteId):
            teamRelevance[teamCid] = teamRelevance.get(teamCid, 0) + relevance

    # Add the city scores to the athlete scores to get one master dict of
    # team => relevance. All cities are merged before ranking; the original
    # returned after merging only the first city.
    if cityTeamScores and teamRelevance:
        for cityScores in cityTeamScores:
            for teamCid, relevance in cityScores.items():
                teamRelevance[teamCid] = teamRelevance.get(teamCid, 0) + relevance

        # sorted() returns a list of (cid, relevance) pairs, highest first.
        rankedTeams = sorted(teamRelevance.items(), key=operator.itemgetter(1), reverse=True)
        log.plog("dict of team => relevance: ", 2)
        for teamCid, relevance in rankedTeams:
            log.plog(str(teamCid) + " => " + str(relevance))
        return rankedTeams[0]