Example #1
0
def getEntities(searchText, title, jsonOut=True, byID=False):
    ###################################################################
    ##func: getEntities
    ##param: searchText - body text, stored as infoModule.info.page['outline']
    ##       title      - headline, stored as infoModule.info.page['title']
    ##       jsonOut    - true: return the hand-built JSON string
    ##                    false: return infoModule.info.entityList as-is
    ##       byID       - true: the "id" field carries the entity id instead
    ##                    of the entity's lookupUrl
    ##desc: runs the entity and nickname finders over "title searchText",
    ##      ranks the results with entities.setPrimo() and reports them.
    ##      Entities whose celeb_type is 'hidden' are left out of the JSON.
    ##ret: JSON array string, '' when there is nothing visible to report,
    ##     or the raw entity list when jsonOut is false
    ##################################################################
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title

    combinedText = title + ' ' + searchText
    entities.entityFinder(combinedText, True)
    entities.nicknameFinder(combinedText, True, True)
    entities.setPrimo()

    entityList = infoModule.info.entityList
    if not jsonOut:
        # previously this path fell through to "return res" with res never
        # assigned (UnboundLocalError); hand back the raw structure instead
        return entityList

    # hacky JSON building; python's json module outputs slightly different than what we need
    # NOTE(review): values are not JSON-escaped, so a name containing a double
    # quote or backslash produces invalid JSON.
    records = []
    for entityId in entityList:
        celeb_type = entityLib.entityLibrary(entityId, 'celeb_type')
        if celeb_type == 'hidden':
            continue

        entityName = entityLib.entityLibrary(entityId, 'entityName')
        linkPath = entityLib.entityLibrary(entityId, 'linkPath')
        if byID:
            #swap URL for ID
            entityURL = str(entityId)
        else:
            entityURL = entityLib.entityLibrary(entityId, 'lookupUrl')

        record = ('{"id":"' + str(entityURL) +
                  '","name":"' + str(entityName) +
                  '","primo":"' + entityList[entityId]['primo'] +
                  '","type":"' + str(celeb_type) + '"')
        if 'nameUsed' in entityList[entityId]:
            #this means there's a nickname response
            record += ',"nameUsed":"' + entityList[entityId]['nameUsed'] + '"'
        record += (',"links":[{"from":"celebrifi.com","url":"' + str(linkPath) +
                   '","icon":"http://informifi.com/fi_icon.png"},'
                   '{"from":"politifi.com","url":"' + str(linkPath) +
                   '","icon":"http://informifi.com/fi_icon.png"}]}')
        records.append(record)

    #catch for nothing but hiddens (and for an empty entity list)
    if not records:
        return ''
    return '[' + ','.join(records) + ']'
def _teamRelevanceRows(cid):
    ###################################################################
    ##func: _teamRelevanceRows
    ##param: cid - entity id used as cid_1 in db_topics.celebs_related
    ##desc: runs the related-entity query restricted to mptype_id = 75
    ##      (presumably the team type - confirm against the schema) and
    ##      drains the whole result set.
    ##ret: list of (cid_2, relevance) int tuples; empty when the query
    ##     returned False or produced no rows
    ##################################################################
    # str() so that integer ids do not raise TypeError on concatenation
    sql = ('SELECT cid_2, relevance FROM db_topics.celebs_related, db_topics.celebs '
           'WHERE celebs_related.cid_2=celebs.celeb_id and cid_1 = ' + str(cid) +
           ' AND mptype_id = 75')
    result = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    if result is False:
        return []

    rows = []
    while True:
        row = result.fetch_row(1, 1)
        if row == ():
            break
        rows.append((int(row[0]['cid_2']), int(row[0]['relevance'])))
    return rows


def getTeamFromCity():
    ###################################################################
    ##func: getTeamFromCity
    ##param: none (takes entities from infoModule.info.entityList)
    ##desc: determines missing team given city(s) and athlete(s), by
    ##      summing the relevance scores of story city(s)/all teams with
    ##      story athlete(s)/all teams.
    ##ret: (team cid, summed relevance) tuple for the highest-scoring team,
    ##     or False when a Team entity is already present, when there are
    ##     no City or no Athlete entities, or when no athlete has any
    ##     related team
    ##auth: mdk
    ##################################################################

    cityIds = []
    athleteIds = []

    for cid in infoModule.info.entityList.keys():
        cidType = entityLib.entityLibrary(cid, 'celeb_type')
        if cidType == 'Team':
            # a team is already identified, nothing to infer
            return False
        if cidType == 'City':
            cityIds.append(cid)
        elif cidType == 'Athlete':
            athleteIds.append(cid)

    # exit if no cities or no athletes identified
    if not cityIds or not athleteIds:
        return False

    # team => summed relevance over all athletes.  Every row counts: the
    # old loop fetched one row before its while-loop and threw it away.
    teamScores = {}
    for athleteId in athleteIds:
        for teamId, relevance in _teamRelevanceRows(athleteId):
            teamScores[teamId] = teamScores.get(teamId, 0) + relevance

    if not teamScores:
        return False

    # add city values to athlete values to get one master dict of
    # team => relevance.  All cities are merged: the old loop returned from
    # inside its first pass, so only the first city was ever considered.
    for cityId in cityIds:
        # dict() keeps the last row per team within one city, as before
        cityScores = dict(_teamRelevanceRows(cityId))
        for teamId, relevance in cityScores.items():
            teamScores[teamId] = teamScores.get(teamId, 0) + relevance

    # first highest-scoring entry, same pick as a stable descending sort
    pickedTeam = max(teamScores.items(), key=operator.itemgetter(1))
    log.plog('picked team =>  ' + str(pickedTeam), 2)
    return pickedTeam
Example #3
0
def _isIntegerText(value):
    ###################################################################
    ##func: _isIntegerText
    ##desc: tells whether value parses as an integer.  None (and any
    ##      other non-parsable value) counts as "not an integer" instead
    ##      of raising TypeError.
    ##ret: bool
    ##################################################################
    try:
        int(value)
    except (TypeError, ValueError):
        return False
    return True


def _microdataItemprop(entityId, organizations, hasName):
    ###################################################################
    ##func: _microdataItemprop
    ##param: entityId, organizations (list of mptype ids), hasName (bool)
    ##desc: picks the itemprop for an entity: 'name' for a human entity
    ##      that has a usable name, 'affiliation' (which wins) when the
    ##      entity's mptype_id is one of the organization types.
    ##ret: 'name', 'affiliation' or None
    ##################################################################
    itemprop = None
    if hasName and entityLib.entityLibrary(entityId, 'human') == str(1):
        itemprop = 'name'
    if int(entityLib.entityLibrary(entityId, 'mptype_id')) in organizations:
        itemprop = 'affiliation'
    return itemprop


def _microdataSpan(content, itemprop, itemtype='Person'):
    ###################################################################
    ##func: _microdataSpan
    ##desc: wraps content in the itemscope/itemprop span pair
    ##ret: string
    ##################################################################
    return ('<span itemscope itemtype="http://www.data-vocabulary.org/' +
            itemtype + '">' + '<span itemprop="' + itemprop + '">' +
            content + '</span></span>')


def _wrapTermWithMicrodata(text, term, itemprop):
    ###################################################################
    ##func: _wrapTermWithMicrodata
    ##desc: wraps term in microdata spans where it is found mid-text, at
    ##      the very start and at the very end of text (one search per
    ##      case; every occurrence of the matched string is replaced).
    ##      term is matched literally.
    ##ret: string
    ##################################################################
    if not term:
        # an empty term would match at the first word boundary
        return text

    wrapped = _microdataSpan(term, itemprop)
    # escape so that names such as "A+E" or "J.Lo" are literal matches
    literal = re.escape(term)

    middle = re.search(r"([^\]=-])\b" + literal + r"('*s*)\b([^\[]){1}",
                       text, re.I)
    if middle is not None:
        text = text.replace(
            middle.group(0),
            middle.group(1) + wrapped + middle.group(2) + middle.group(3))

    # the start-of-text search is case sensitive, as it always was
    atStart = re.search("^" + literal + r"('*s*)\b([^\[]){1}", text)
    if atStart is not None:
        text = text.replace(atStart.group(0),
                            wrapped + atStart.group(1) + atStart.group(2))

    atEnd = re.search(r'\b' + literal + '$', text, re.I)
    if atEnd is not None:
        text = text.replace(atEnd.group(0), wrapped)

    return text


def _stashMicrodataBlocks(text, stash):
    ###################################################################
    ##func: _stashMicrodataBlocks
    ##desc: replaces every finished microdata block in text with
    ##      ~#~(index)~#~ and appends the block to stash at that index,
    ##      so that later passes cannot match inside it.
    ##ret: string (stash is extended in place)
    ##################################################################
    while True:
        block = re.search(r'<span itemscope.*?<\/span><\/span>', text)
        if block is None:
            break
        text = text.replace(block.group(0), "~#~" + str(len(stash)) + "~#~")
        stash.append(block.group(0))
    return text


def addMicrodata(text, entities, fullNames=False):
    ###################################################################
    ##func: addMicrodata
    ##param: text      - text (may contain html) to mark up
    ##       entities  - iterable of entity ids
    ##       fullNames - when not False, only full entity names and
    ##                   nicknames are marked up; the first/last name
    ##                   pass is skipped
    ##desc: wraps entity names found in text with data-vocabulary.org
    ##      microdata spans.  Three passes: full entity name, first and
    ##      last name, nicknames from db_topics.nicknames.  Html tags are
    ##      set aside first and restored at the end.  Entities whose
    ##      visibility is 'invisible' are ignored.
    ##ret: string (text is returned unchanged when text or entities is
    ##     empty)
    ##################################################################

    # mptypes by category
    organizations = [37, 43, 44, 48, 49, 66, 76, 79, 104, 110, 120]

    if len(entities) == 0:
        log.plog("no entities passed to addMicrodata", 5)
        return text

    if text is None or text == '':
        log.plog("no text passed to addMicrodata", 5)
        return text

    #celebList should be sorted from long to short
    orderedIds = sorted(
        entities,
        key=lambda entity: len(entityLib.entityLibrary(entity, 'entityName')),
        reverse=True)
    visibleIds = [
        entityId for entityId in orderedIds
        if entityLib.entityLibrary(entityId, 'visibility') != 'invisible'
    ]

    #set aside all html so that celeb matches are not made inside
    htmlBlocks = []
    while True:
        tag = re.search('<.*?>', text)
        if tag is None:
            break
        text = text.replace(tag.group(0), "~*~%d~*~" % len(htmlBlocks))
        htmlBlocks.append(tag.group(0))

    # pass 1: full entity name
    # NOTE(review): blocks are not stashed between entities in this pass, so
    # a shorter name can still match inside a span written for a longer one.
    for entityId in visibleIds:
        name = entityLib.entityLibrary(entityId, 'entityName').strip()
        itemprop = _microdataItemprop(entityId, organizations, name != '')
        if itemprop is not None:
            text = _wrapTermWithMicrodata(text, name, itemprop)

    # pass 2: first and last name
    # finished blocks are parked in stash; ~#~(num)~#~ points at the entry so
    # that the text can be rebuilt after nicknames are run.
    stash = []
    if fullNames == False:
        #only do first and last name if the fullNames (meaning require full name) isn't set to true
        text = _stashMicrodataBlocks(text, stash)

        for entityId in visibleIds:
            lname = entityLib.entityLibrary(entityId, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityId, 'fname')
            if fname is not None:
                fname = fname.strip()

            itemprop = _microdataItemprop(entityId, organizations,
                                          lname != '' or fname != '')
            if itemprop is None:
                continue

            for namePart in (fname, lname):
                # don't use first and last name if fname or lname is a number;
                # a missing (None) part is skipped rather than crashing int()
                if namePart and not _isIntegerText(namePart):
                    text = _wrapTermWithMicrodata(text, namePart, itemprop)

    # pass 3: nicknames, after the rest are done
    text = _stashMicrodataBlocks(text, stash)

    for entityId in visibleIds:
        itemprop = _microdataItemprop(entityId, organizations, True)
        if itemprop is None:
            continue

        nicknamesQ = mysql_tools.mysqlQuery(
            "select name, case_sensitive from db_topics.nicknames where cid_1="
            + str(entityId) + " or cid_2=" + str(entityId) + " or cid_3=" +
            str(entityId), infoModule.info.site['dblink'])
        if nicknamesQ is False:
            continue

        while True:
            nicknameRow = nicknamesQ.fetch_row(1, 1)
            if nicknameRow == ():
                break
            nickname = nicknameRow[0]['name']
            if not nickname:
                # an empty match would make str.replace rewrite the whole text
                continue

            # NOTE(review): this uses re.I when case_sensitive == 1, which
            # looks inverted, and compares against the int 1 although other
            # lookups in this file compare against str(1).  Left as it was -
            # confirm the intended semantics before changing.
            if nicknameRow[0]['case_sensitive'] == 1:
                searchFlags = re.I
            else:
                searchFlags = 0
            nicknameMatch = re.search(r'\b' + re.escape(nickname) + r'\b',
                                      text, searchFlags)
            if nicknameMatch:
                text = text.replace(
                    nicknameMatch.group(0),
                    _microdataSpan(nicknameMatch.group(0), itemprop))

            #take it out and store it off to prevent more dupes
            text = _stashMicrodataBlocks(text, stash)

    # put the stashed microdata blocks back (blocks may hold placeholders
    # themselves, hence search-until-none instead of a single sweep)
    while True:
        placeholder = re.search(r'~#~(\d+)~#~', text)
        if placeholder is None:
            break
        text = text.replace(placeholder.group(0),
                            stash[int(placeholder.group(1))])

    # put the html back
    for index, htmlBlock in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(index) + '~*~', htmlBlock)

    return text
Example #4
0
def setPrimo():
    ###################################################################
    ##func: setPrimo
    ##param: none
    ##desc: takes no params.  Instead, as long as page title, page text
    ##      and celebList are properly put together, it will rank the entities
    ##      for primo position.  Ranks are handed out in this order: the
    ##      entity tied to the source, visible entities found in the title
    ##      (by position), visible entities mentioned more than once (by
    ##      frequency).  If nothing got a rank, the first visible entity
    ##      by position gets the top one.
    ##ret: None
    ##auth: esr
    ##################################################################

    page = infoModule.info.page
    entityList = infoModule.info.entityList

    if 'outline' not in page or page['outline'] == '':
        log.plog('no outline set before setPrimo called', 5)
        return

    if 'title' not in page or page['title'] == '':
        log.plog('no title set before setPrimo called', 5)
        return

    if len(entityList) == 0:
        log.plog('no entities set before setPrimo called', 5)
        return

    #highlight text as a way to count the frequency
    highlightedText = highlightEntities.highlightEntitiesFromList(
        page['title'] + ' ' + page['outline'], entityList)
    while True:
        taggedEntity = re.search(r'\[celeb url=(.+?)\]', highlightedText)
        if taggedEntity is None:
            break
        # drop the tag just counted so the next search finds the next one
        highlightedText = highlightedText.replace(taggedEntity.group(0), '', 1)
        lookupUrl = taggedEntity.group(1)
        sql = "select celeb_id from db_topics.celebs where lookupUrl='" + lookupUrl + "'"
        entityByLookupQ = mysql_tools.mysqlQuery(
            sql, infoModule.info.site['dblink'])
        if entityByLookupQ is False:
            continue
        while True:
            entityByLookup = entityByLookupQ.fetch_row(1, 1)
            if entityByLookup == ():
                break
            entry = entityList[entityByLookup[0]['celeb_id']]
            # a None frequency counts as 0 (it used to raise TypeError on +=)
            entry['frequency'] = (entry['frequency'] or 0) + 1

    # now frequency is set, first two are based on position, next two based on frequency
    primoTypes = ['Y', '2', '3', '4', 'N']
    primoTypeCursor = 0

    #if source is associated with entity, it's auto-set to primo
    source = infoModule.info.source
    sourceEntity = None
    if ('celeb_id' in source and source['celeb_id'] != ''
            and int(source['celeb_id']) > 0):
        sourceEntity = source['celeb_id']
        entityList[sourceEntity]['primo'] = primoTypes[primoTypeCursor]
        primoTypeCursor += 1

    # (entity, position) pairs, earliest first
    posArray = [(eKey, entityList[eKey]['position'])
                for eKey in entityList.keys()
                if entityList[eKey]['position'] is not None]
    posArray.sort(key=lambda pair: pair[1])

    # (entity, frequency) pairs for entities seen more than once, most
    # frequent first
    freqArray = [(eKey, entityList[eKey]['frequency'])
                 for eKey in entityList.keys()
                 if entityList[eKey]['frequency'] is not None
                 and entityList[eKey]['frequency'] > 1]
    freqArray.sort(key=lambda pair: pair[1], reverse=True)

    titleLen = len(page['title'])
    #primo set by position in title
    for eKey, position in posArray:
        if primoTypeCursor >= 4 or position > titleLen:
            break
        if eKey == sourceEntity:
            # already holds the top rank; this pass used to overwrite it
            # with a lower one, leaving no entity at 'Y'
            continue
        ## only non-hidden entities can be primo
        if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s found in title, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    #primo set by frequency
    for eKey, frequency in freqArray:
        if primoTypeCursor >= 4:
            break
        if entityList[eKey]['primo'] != 'N':
            continue
        ## only non-hidden entities can be primo
        if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s has high frequency, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    #if nothing found, go to first position and set it to primo
    if primoTypeCursor == 0:
        for eKey, position in posArray:
            ## only non-hidden entities can be primo
            if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
                log.plog(
                    "entity %s found first in story, setting to primo %s" %
                    (eKey, primoTypes[primoTypeCursor]), 3)
                entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
                primoTypeCursor += 1
                break
Example #5
0
# Command-line check of entityLib lookups (Python 2: print statements).
# With a positive integer argument it prints entityName, lookupUrl and bio
# for that entity id; otherwise it queries 20 random celeb_ids from
# db_topics.celebs and starts reading the rows.
link = mysql_tools.mysqlConnect()
if link == False :
    # NOTE(review): exits with status 0 although the connection failed
    print "no connection"
    sys.exit(0)
    
# shared settings read by the other modules through infoModule.info.site
infoModule.info.site['dblink'] = link	
infoModule.info.site['log_priority_threshold'] = 3	

# int() raises ValueError when the first argument is not numeric
if len(sys.argv) > 1 and int(sys.argv[1]) > 0:
    #get specific sub_id
    print "looking up entity_id: " + sys.argv[1]
    cid = sys.argv[1]

    print "%s:" % cid
    print "\t%s" % entityLib.entityLibrary(int(cid), 'entityName')
    print "\t%s" % entityLib.entityLibrary(int(cid), 'lookupUrl')
    print "\t%s" % entityLib.entityLibrary(int(cid), 'bio')


else:
    # no id given: sample 20 random entities instead
    sql = "select celeb_id from db_topics.celebs order by rand() limit 20"
    er = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    
    # mysqlQuery signals failure by returning False
    if er == False:
        print "select random entities failed"
        sys.exit(0)
    
    
    # fetch_row(1,1) is read one row at a time
    # NOTE(review): as shown here the loop has no exit; the end-of-rows
    # handling is not visible in this excerpt
    while (1):
        row=er.fetch_row(1,1)
Example #6
0
def setPrimo():
    ###################################################################
    ##func: setPrimo
    ##param: none
    ##desc: takes no params.  Instead, as long as page title, page text
    ##      and celebList are properly put together, it will rank the entities
    ##      for primo position.  Ranks are handed out in this order: the
    ##      entity tied to the source, visible entities found in the title
    ##      (by position), visible entities mentioned more than once (by
    ##      frequency).  If nothing got a rank, the first visible entity
    ##      by position gets the top one.
    ##ret: None
    ##auth: esr
    ##################################################################

    page = infoModule.info.page
    entityList = infoModule.info.entityList

    if 'outline' not in page or page['outline'] == '':
        log.plog('no outline set before setPrimo called', 5)
        return

    if 'title' not in page or page['title'] == '':
        log.plog('no title set before setPrimo called', 5)
        return

    if len(entityList) == 0:
        log.plog('no entities set before setPrimo called', 5)
        return

    #highlight text as a way to count the frequency
    highlightedText = highlightEntities.highlightEntitiesFromList(
        page['title'] + ' ' + page['outline'], entityList)
    while True:
        taggedEntity = re.search(r'\[celeb url=(.+?)\]', highlightedText)
        if taggedEntity is None:
            break
        # drop the tag just counted so the next search finds the next one
        highlightedText = highlightedText.replace(taggedEntity.group(0), '', 1)
        lookupUrl = taggedEntity.group(1)
        sql = "select celeb_id from db_topics.celebs where lookupUrl='" + lookupUrl + "'"
        entityByLookupQ = mysql_tools.mysqlQuery(
            sql, infoModule.info.site['dblink'])
        if entityByLookupQ is False:
            continue
        while True:
            entityByLookup = entityByLookupQ.fetch_row(1, 1)
            if entityByLookup == ():
                break
            entry = entityList[entityByLookup[0]['celeb_id']]
            # a None frequency counts as 0 (it used to raise TypeError on +=)
            entry['frequency'] = (entry['frequency'] or 0) + 1

    # now frequency is set, first two are based on position, next two based on frequency
    primoTypes = ['Y', '2', '3', '4', 'N']
    primoTypeCursor = 0

    #if source is associated with entity, it's auto-set to primo
    source = infoModule.info.source
    sourceEntity = None
    if ('celeb_id' in source and source['celeb_id'] != ''
            and int(source['celeb_id']) > 0):
        sourceEntity = source['celeb_id']
        entityList[sourceEntity]['primo'] = primoTypes[primoTypeCursor]
        primoTypeCursor += 1

    # (entity, position) pairs, earliest first
    posArray = [(eKey, entityList[eKey]['position'])
                for eKey in entityList.keys()
                if entityList[eKey]['position'] is not None]
    posArray.sort(key=lambda pair: pair[1])

    # (entity, frequency) pairs for entities seen more than once, most
    # frequent first
    freqArray = [(eKey, entityList[eKey]['frequency'])
                 for eKey in entityList.keys()
                 if entityList[eKey]['frequency'] is not None
                 and entityList[eKey]['frequency'] > 1]
    freqArray.sort(key=lambda pair: pair[1], reverse=True)

    titleLen = len(page['title'])
    #primo set by position in title
    for eKey, position in posArray:
        if primoTypeCursor >= 4 or position > titleLen:
            break
        if eKey == sourceEntity:
            # already holds the top rank; this pass used to overwrite it
            # with a lower one, leaving no entity at 'Y'
            continue
        ## only non-hidden entities can be primo
        if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s found in title, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    #primo set by frequency
    for eKey, frequency in freqArray:
        if primoTypeCursor >= 4:
            break
        if entityList[eKey]['primo'] != 'N':
            continue
        ## only non-hidden entities can be primo
        if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s has high frequency, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    #if nothing found, go to first position and set it to primo
    if primoTypeCursor == 0:
        for eKey, position in posArray:
            ## only non-hidden entities can be primo
            if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
                log.plog(
                    "entity %s found first in story, setting to primo %s" %
                    (eKey, primoTypes[primoTypeCursor]), 3)
                entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
                primoTypeCursor += 1
                break
Example #7
0
def highlightEntitiesFromList(text, entities, fullNames=False):
    ###################################################################
    ##func: highlightEntitiesFromList
    ##param: text      - string to mark up; may contain html tags
    ##       entities  - iterable of entity ids (must support len())
    ##       fullNames - when True, only full entity names (and
    ##                   nicknames) are matched; the separate first
    ##                   name / last name passes are skipped
    ##desc: takes a list of entities (id numbers) and finds all name matches
    ##      and nickname matches and wraps those in the text with
    ##      [celeb url="celeb-url"][/celeb]
    ##      html tags are set aside first so no match is made inside a
    ##      tag, and are put back before returning.
    ##      in the source_reader context, it will probably be phased out
    ##      its current usage is to mark up the outline of a page and to
    ##      be a tool for determining the frequency with which entities appear
    ##ret: string (text is returned untouched when entities is empty or
    ##     text is None / '')
    ##auth: esr
    ##################################################################

    if len(entities) == 0:
        log.plog("no entities passed to highlightEntitiesFromList", 5)
        return text

    if (text == None or text == ''):
        log.plog("no text passed to highlightEntitiesFromList", 5)
        return text

    celebBlockRe = r'\[celeb url.*?\[\/celeb\]'
    # finished [celeb] blocks are parked here; "~#~<index>~#~" in the
    # text points back at the parked block
    tmpArray = []

    def isNumber(value):
        # int(None) raises TypeError rather than ValueError; a missing
        # name part is simply "not a number" instead of a crash
        try:
            int(value)
        except (TypeError, ValueError):
            return False
        return True

    def wrapMatches(haystack, needle, lookupUrl):
        # wraps needle in [celeb] tags: mid-text, at the very start,
        # and at the very end of haystack.
        # needle is escaped so names containing regex metacharacters
        # are matched literally instead of breaking the pattern
        pattern = re.escape(needle)
        marked = "[celeb url=" + lookupUrl + "]" + needle + "[/celeb]"

        # preceded by anything but "]", "=" or "-" so text that is
        # already inside a tag / url is left alone
        step1 = re.search(
            r"([^\]=-])\b" + pattern + r"('*s*)\b([^\[]){1}", haystack, re.I)
        if step1 != None:
            haystack = haystack.replace(
                step1.group(0),
                step1.group(1) + marked + step1.group(2) + step1.group(3))

        step2 = re.search("^" + pattern + r"('*s*)\b([^\[]){1}", haystack)
        if step2 != None:
            haystack = haystack.replace(
                step2.group(0), marked + step2.group(1) + step2.group(2))

        step3 = re.search(r'\b' + pattern + '$', haystack, re.I)
        if step3 != None:
            haystack = haystack.replace(step3.group(0), marked)
        return haystack

    def stashBlocks(haystack):
        # swaps every [celeb] block for a ~#~n~#~ placeholder so later
        # passes cannot match inside a block that is already tagged
        while True:
            squareBlocks = re.search(celebBlockRe, haystack)
            if squareBlocks == None:
                return haystack
            tmpArray.append(squareBlocks.group(0))
            haystack = haystack.replace(
                squareBlocks.group(0),
                "~#~" + str(len(tmpArray) - 1) + "~#~")

    #celebList should be sorted from long to short
    entityNameArray = [
        [entity, len(entityLib.entityLibrary(entity, 'entityName'))]
        for entity in entities
    ]
    entityNameArray.sort(key=lambda nameLen: nameLen[1], reverse=True)

    htmlBlocks = []
    #set aside all html so that celeb matches are not made inside
    while True:
        reres = re.search('<.*?>', text)
        if reres == None:
            break
        htmlBlocks.append(reres.group(0))
        text = text.replace(reres.group(0),
                            "~*~%d~*~" % (len(htmlBlocks) - 1))

    # pass 1: full entity names
    for entityTuple in entityNameArray:
        if entityLib.entityLibrary(entityTuple[0],
                                   'visibility') == 'invisible':
            continue
        name = entityLib.entityLibrary(entityTuple[0], 'entityName').strip()
        if name != '':
            lookupUrl = entityLib.entityLibrary(entityTuple[0], 'lookupUrl')
            text = wrapMatches(text, name, lookupUrl)

    # pass 2: first and last name
    if fullNames == False:
        #only do first and last name if the fullNames (meaning require full name) isn't set to true
        text = stashBlocks(text)

        for entityTuple in entityNameArray:
            if entityLib.entityLibrary(entityTuple[0],
                                       'visibility') == 'invisible':
                continue
            lname = entityLib.entityLibrary(entityTuple[0], 'lname')
            if lname != None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityTuple[0], 'fname')
            if fname != None:
                fname = fname.strip()
            lookupUrl = entityLib.entityLibrary(entityTuple[0], 'lookupUrl')
            # don't use first and last name if fname or lname is a number.
            # each part is wrapped with its own text (the end-of-text
            # first-name match used to be replaced by the last name)
            for part in (fname, lname):
                if part != None and part != '' and not isNumber(part):
                    text = wrapMatches(text, part, lookupUrl)

    # pass 3: nicknames, after the rest are done
    text = stashBlocks(text)

    for entityTuple in entityNameArray:
        if entityLib.entityLibrary(entityTuple[0],
                                   'visibility') == 'invisible':
            continue
        lookupUrl = entityLib.entityLibrary(entityTuple[0], 'lookupUrl')
        cid = str(entityTuple[0])
        nicknamesQ = mysql_tools.mysqlQuery(
            "select name, case_sensitive from db_topics.nicknames where cid_1="
            + cid + " or cid_2=" + cid + " or cid_3=" + cid,
            infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1, 1)
            if nicknameRow == ():
                break
            # NOTE(review): re.I is applied when case_sensitive == 1,
            # which reads as inverted - confirm against the nicknames
            # table before changing; behaviour is kept as it was
            if nicknameRow[0]['case_sensitive'] == 1:
                nicknameMatch = re.search(
                    '\\b' + nicknameRow[0]['name'] + '\\b', text, re.I)
            else:
                nicknameMatch = re.search(
                    '\\b' + nicknameRow[0]['name'] + '\\b', text)
            if nicknameMatch != None:
                text = text.replace(
                    nicknameMatch.group(0), "[celeb url=" + lookupUrl +
                    "]" + nicknameMatch.group(0) + "[/celeb]")

            #take it out and store it off to prevent more dupes
            text = stashBlocks(text)

    # put the parked [celeb] blocks back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock == None:
            break
        text = text.replace(repBlock.group(0),
                            tmpArray[int(repBlock.group(1))])

    # pull possessive / plural suffixes inside the closing tag
    text = text.replace("[/celeb]'s", "'s[/celeb]")
    text = text.replace("[/celeb]s", "s[/celeb]")

    # put the html back
    for i, block in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', block)

    return text

def getIds(sub_id):
    # Registers every celeb_id linked to sub_id (table subs_celebs of
    # the current site database) in infoModule.info.entityList with a
    # blank record: no position, frequency 0, primo 'N'.
    # Returns nothing; the entity list is updated in place.
    query = ''.join([
        'SELECT celeb_id FROM ',
        infoModule.info.site['database'],
        '.subs_celebs WHERE sub_id = ',
        str(sub_id),
    ])
    linkedRows = mysql_tools.mysqlQuery(
        query, infoModule.info.site['dblink']).fetch_row(0, 1)

    for linked in linkedRows:
        infoModule.info.entityList[linked['celeb_id']] = dict(
            position=None, frequency=0, primo='N')


if 1:
    # Always-on manual check, run at module level, against story 52139:
    # http://dev.sportifi.com/news/Carmelo-about-to-put-fans-in-an-awkward-position-52139.html
    getIds(52139)

    storyEntities = infoModule.info.entityList

    # delete team for testing purposes
    del storyEntities['853957']

    # list what is left as "<cid> - <celeb_type>"
    for cid in storyEntities.keys():
        print(str(cid) + " - " + entityLib.entityLibrary(cid, 'celeb_type'))

    print(getRelevantEntity())
Example #9
0
def getEntities(searchText, title, jsonOut=True, byID=False):
    ###################################################################
    ##func: getEntities
    ##param: searchText - body text, stored as the page outline
    ##       title      - page title
    ##       jsonOut    - when True (default) return a JSON-like string
    ##       byID       - when True the "id" field carries the entity
    ##                    id instead of its lookupUrl
    ##desc: runs entityFinder, nicknameFinder and setPrimo over
    ##      title + ' ' + searchText, then serialises the non-hidden
    ##      entities left in infoModule.info.entityList
    ##ret: jsonOut True  - string '[{...},{...}]', or '' when no
    ##                     entity was found or all of them are hidden
    ##     jsonOut False - infoModule.info.entityList itself (this
    ##                     path used to fail with UnboundLocalError
    ##                     because no result was ever assigned)
    ##################################################################
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title

    combined = title + ' ' + searchText
    entities.entityFinder(combined, True)
    entities.nicknameFinder(combined, True, True)
    entities.setPrimo()

    entityList = infoModule.info.entityList
    if not (jsonOut == True):
        return entityList

    if len(entityList) == 0:
        return ''

    iconTail = '","icon":"http://informifi.com/fi_icon.png"}'

    # hacky JSON building; python's json module outputs slightly different than what we need
    # NOTE(review): values are spliced in unescaped, so a name holding
    # a double quote yields invalid JSON
    records = []
    for ids in entityList.keys():
        entityName = entityLib.entityLibrary(ids, 'entityName')
        celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
        linkPath = entityLib.entityLibrary(ids, 'linkPath')
        entityURL = entityLib.entityLibrary(ids, 'lookupUrl')
        if byID:
            #swap URL for ID
            entityURL = str(ids)

        if celeb_type == 'hidden':
            continue

        record = ('{"id":"' + str(entityURL) +
                  '","name":"' + str(entityName) +
                  '","primo":"' + entityList[ids]['primo'] +
                  '","type":"' + str(celeb_type) + '",')
        if 'nameUsed' in entityList[ids]:
            #this means there's a nickname response
            record += '"nameUsed":"' + entityList[ids]['nameUsed'] + '",'
        record += ('"links":[{"from":"celebrifi.com","url":"' +
                   str(linkPath) + iconTail +
                   ',{"from":"politifi.com","url":"' +
                   str(linkPath) + iconTail + ']}')
        records.append(record)

    #catch for nothing but hiddens
    if len(records) == 0:
        return ''
    return '[' + ','.join(records) + ']'
Example #10
0
def addMicrodata(text, entities, fullNames = False):
    ###################################################################
    ##func: addMicrodata
    ##param: text      - string to mark up; may contain html tags
    ##       entities  - iterable of entity ids (must support len())
    ##       fullNames - when True, only full entity names (and
    ##                   nicknames) are matched; the separate first
    ##                   name / last name passes are skipped
    ##desc: finds name, first/last name and nickname matches for the
    ##      given entities and wraps them in data-vocabulary.org
    ##      microdata spans:
    ##      <span itemscope itemtype=".../Person"><span itemprop="...">
    ##      itemprop is "name" for entities flagged human and
    ##      "affiliation" for entities whose mptype_id is in the
    ##      organizations list; other entities are not marked up.
    ##      html tags already in the text are set aside first and put
    ##      back before returning.
    ##ret: string (text is returned untouched when entities is empty or
    ##     text is None / '')
    ##################################################################

    # mptypes by category
    organizations = [37,43,44,48,49,66,76,79,104,110,120]

    if len(entities) == 0:
        log.plog("no entities passed to addMicrodata", 5)
        return text

    if (text == None or text == ''):
        log.plog("no text passed to addMicrodata", 5)
        return text

    microdataBlockRe = r'<span itemscope.*?<\/span><\/span>'
    # finished microdata blocks are parked here; "~#~<index>~#~" in
    # the text points back at the parked block
    tmpArray = []

    def isNumber(value):
        # int(None) raises TypeError rather than ValueError; a missing
        # name part is simply "not a number" instead of a crash
        try:
            int(value)
        except (TypeError, ValueError):
            return False
        return True

    def classify(cid, mayBePerson):
        # returns (itemtype, itemprop); both None when the entity gets
        # no microdata. the organization check runs last and so wins.
        itemtype = itemprop = None
        if mayBePerson and entityLib.entityLibrary(cid, 'human') == str(1):
            itemtype = 'Person'
            itemprop = 'name'
        if int(entityLib.entityLibrary(cid, 'mptype_id')) in organizations:
            itemtype = 'Person'
            itemprop = 'affiliation'
        return itemtype, itemprop

    def spanFor(label, itemtype, itemprop):
        # builds the nested itemscope / itemprop span around label
        return ('<span itemscope itemtype="http://www.data-vocabulary.org/' +
                itemtype + '">' + '<span itemprop="' + itemprop + '">' +
                label + '</span></span>')

    def wrapMatches(haystack, needle, itemtype, itemprop):
        # wraps needle in microdata spans: mid-text, at the very start,
        # and at the very end of haystack.
        # needle is escaped so names containing regex metacharacters
        # are matched literally instead of breaking the pattern
        pattern = re.escape(needle)
        marked = spanFor(needle, itemtype, itemprop)

        step1 = re.search(r"([^\]=-])\b" + pattern + r"('*s*)\b([^\[]){1}", haystack, re.I)
        if step1 != None:
            haystack = haystack.replace(step1.group(0), step1.group(1) + marked + step1.group(2) + step1.group(3))
        step2 = re.search("^" + pattern + r"('*s*)\b([^\[]){1}", haystack)
        if step2 != None:
            haystack = haystack.replace(step2.group(0), marked + step2.group(1) + step2.group(2))
        step3 = re.search(r'\b' + pattern + '$', haystack, re.I)
        if step3 != None:
            haystack = haystack.replace(step3.group(0), marked)
        return haystack

    def stashBlocks(haystack):
        # swaps every microdata block for a ~#~n~#~ placeholder so later
        # passes cannot match inside a block that is already marked up
        while True:
            squareBlocks = re.search(microdataBlockRe, haystack)
            if squareBlocks == None:
                return haystack
            tmpArray.append(squareBlocks.group(0))
            haystack = haystack.replace(squareBlocks.group(0), "~#~" + str(len(tmpArray) - 1) + "~#~")

    #celebList should be sorted from long to short
    entityNameArray = [[entity, len(entityLib.entityLibrary(entity, 'entityName'))] for entity in entities]
    entityNameArray.sort(key=lambda nameLen: nameLen[1], reverse=True)

    htmlBlocks = []
    #set aside all html so that celeb matches are not made inside
    while True:
        reres = re.search('<.*?>', text)
        if reres == None:
            break
        htmlBlocks.append(reres.group(0))
        text = text.replace(reres.group(0), "~*~%d~*~" % (len(htmlBlocks) - 1))

    # pass 1: full entity names
    for entityTuple in entityNameArray:
        if entityLib.entityLibrary(entityTuple[0], 'visibility') == 'invisible':
            continue
        name = entityLib.entityLibrary(entityTuple[0], 'entityName').strip()
        # an empty name would turn the patterns below into "match any
        # word boundary", so it is skipped outright
        if name == '':
            continue
        itemtype, itemprop = classify(entityTuple[0], True)
        if itemtype and itemprop != None:
            text = wrapMatches(text, name, itemtype, itemprop)

    # pass 2: first and last name
    if fullNames == False:
        #only do first and last name if the fullNames (meaning require full name) isn't set to true
        text = stashBlocks(text)

        for entityTuple in entityNameArray:
            if entityLib.entityLibrary(entityTuple[0], 'visibility') == 'invisible':
                continue
            lname = entityLib.entityLibrary(entityTuple[0], 'lname')
            if lname != None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityTuple[0], 'fname')
            if fname != None:
                fname = fname.strip()

            # affiliation microdata
            # Get mytype_id and see if res is in organization array
            itemtype, itemprop = classify(entityTuple[0], lname != '' or fname != '')
            if itemtype and itemprop != None:
                # don't use first and last name if fname or lname is a number
                for part in (fname, lname):
                    if part != None and part != '' and not isNumber(part):
                        text = wrapMatches(text, part, itemtype, itemprop)

    # pass 3: nicknames, after the rest are done
    text = stashBlocks(text)

    for entityTuple in entityNameArray:
        if entityLib.entityLibrary(entityTuple[0], 'visibility') == 'invisible':
            continue
        # affiliation microdata
        # Get mytype_id and see if res is in organization array
        itemtype, itemprop = classify(entityTuple[0], True)
        if itemtype and itemprop != None:
            cid = str(entityTuple[0])
            nicknamesQ = mysql_tools.mysqlQuery("select name, case_sensitive from db_topics.nicknames where cid_1=" + cid + " or cid_2=" + cid + " or cid_3=" + cid, infoModule.info.site['dblink'])
            while True:
                nicknameRow = nicknamesQ.fetch_row(1,1)
                if nicknameRow == ():
                    break
                # NOTE(review): re.I is applied when case_sensitive == 1,
                # which reads as inverted - confirm against the nicknames
                # table before changing; behaviour is kept as it was
                if nicknameRow[0]['case_sensitive'] == 1:
                    nicknameMatch = re.search('\\b' + nicknameRow[0]['name'] + '\\b', text, re.I)
                else:
                    nicknameMatch = re.search('\\b' + nicknameRow[0]['name'] + '\\b', text)
                if nicknameMatch and itemtype != None:
                    text = text.replace(nicknameMatch.group(0), spanFor(nicknameMatch.group(0), itemtype, itemprop))

                #take it out and store it off to prevent more dupes
                text = stashBlocks(text)

    # put the parked microdata blocks back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock == None:
            break
        text = text.replace(repBlock.group(0), tmpArray[int(repBlock.group(1))])

    # put the html back
    for i, block in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', block)

    return text
Example #11
0
	sys.exit(0)
	
# set globals for site
# 'database' is the schema name getIds() prefixes to subs_celebs;
# 'dblink' is handed to mysql_tools.mysqlQuery as its second argument.
# NOTE(review): `link` is not defined in this excerpt - presumably the
# open database connection created above; confirm.
infoModule.info.site['database'] = 'db_sportifi'
infoModule.info.site['dblink']   = link

def getIds(sub_id):
    # Looks up the celeb_ids attached to sub_id in the site database's
    # subs_celebs table and gives each one a fresh entry in
    # infoModule.info.entityList (position None, frequency 0,
    # primo 'N'). Nothing is returned.
    sqlParts = [
        'SELECT celeb_id FROM ',
        infoModule.info.site['database'],
        '.subs_celebs WHERE sub_id = ',
        str(sub_id),
    ]
    celebQ = mysql_tools.mysqlQuery(''.join(sqlParts), infoModule.info.site['dblink'])

    for celebRow in celebQ.fetch_row(0, 1):
        blankEntry = {'position': None, 'frequency': 0, 'primo': 'N'}
        infoModule.info.entityList[celebRow['celeb_id']] = blankEntry

if 1:
    # Always-on manual check, run at module level, against story 52139:
    # http://dev.sportifi.com/news/Carmelo-about-to-put-fans-in-an-awkward-position-52139.html
    getIds(52139)

    loadedEntities = infoModule.info.entityList

    # delete team for testing purposes
    del loadedEntities['853957']

    # list what is left as "<cid> - <celeb_type>"
    for cid in loadedEntities.keys():
        print(str(cid) + " - " + entityLib.entityLibrary(cid, 'celeb_type'))

    print(getRelevantEntity())


Example #12
0
def highlightEntitiesFromList(text, entities, fullNames = False):
    ###################################################################
    ##func: highlightEntitiesFromList
    ##param: text      - string to mark up; may contain html tags
    ##       entities  - iterable of entity ids (must support len())
    ##       fullNames - when True, only full entity names (and
    ##                   nicknames) are matched; the separate first
    ##                   name / last name passes are skipped
    ##desc: takes a list of entities (id numbers) and finds all name matches
    ##      and nickname matches and wraps those in the text with
    ##      [celeb url="celeb-url"][/celeb]
    ##      html tags are set aside first so no match is made inside a
    ##      tag, and are put back before returning.
    ##      in the source_reader context, it will probably be phased out
    ##      its current usage is to mark up the outline of a page and to
    ##      be a tool for determining the frequency with which entities appear
    ##ret: string (text is returned untouched when entities is empty or
    ##     text is None / '')
    ##auth: esr
    ##################################################################

    if len(entities) == 0:
        log.plog("no entities passed to highlightEntitiesFromList", 5)
        return text

    if (text == None or text == ''):
        log.plog("no text passed to highlightEntitiesFromList", 5)
        return text

    celebBlockRe = r'\[celeb url.*?\[\/celeb\]'
    # finished [celeb] blocks are parked here; "~#~<index>~#~" in the
    # text points back at the parked block
    tmpArray = []

    def isNumber(value):
        # int(None) raises TypeError rather than ValueError; a missing
        # name part is simply "not a number" instead of a crash
        try:
            int(value)
        except (TypeError, ValueError):
            return False
        return True

    def wrapMatches(haystack, needle, lookupUrl):
        # wraps needle in [celeb] tags: mid-text, at the very start,
        # and at the very end of haystack.
        # needle is escaped so names containing regex metacharacters
        # are matched literally instead of breaking the pattern
        pattern = re.escape(needle)
        marked = "[celeb url=" + lookupUrl + "]" + needle + "[/celeb]"

        # preceded by anything but "]", "=" or "-" so text that is
        # already inside a tag / url is left alone
        step1 = re.search(r"([^\]=-])\b" + pattern + r"('*s*)\b([^\[]){1}", haystack, re.I)
        if step1 != None:
            haystack = haystack.replace(step1.group(0), step1.group(1) + marked + step1.group(2) + step1.group(3))

        step2 = re.search("^" + pattern + r"('*s*)\b([^\[]){1}", haystack)
        if step2 != None:
            haystack = haystack.replace(step2.group(0), marked + step2.group(1) + step2.group(2))

        step3 = re.search(r'\b' + pattern + '$', haystack, re.I)
        if step3 != None:
            haystack = haystack.replace(step3.group(0), marked)
        return haystack

    def stashBlocks(haystack):
        # swaps every [celeb] block for a ~#~n~#~ placeholder so later
        # passes cannot match inside a block that is already tagged
        while True:
            squareBlocks = re.search(celebBlockRe, haystack)
            if squareBlocks == None:
                return haystack
            tmpArray.append(squareBlocks.group(0))
            haystack = haystack.replace(squareBlocks.group(0), "~#~" + str(len(tmpArray) - 1) + "~#~")

    #celebList should be sorted from long to short
    entityNameArray = [[entity, len(entityLib.entityLibrary(entity, 'entityName'))] for entity in entities]
    entityNameArray.sort(key=lambda nameLen: nameLen[1], reverse=True)

    htmlBlocks = []
    #set aside all html so that celeb matches are not made inside
    while True:
        reres = re.search('<.*?>', text)
        if reres == None:
            break
        htmlBlocks.append(reres.group(0))
        text = text.replace(reres.group(0), "~*~%d~*~" % (len(htmlBlocks) - 1))

    # pass 1: full entity names
    for entityTuple in entityNameArray:
        if entityLib.entityLibrary(entityTuple[0], 'visibility') == 'invisible':
            continue
        name = entityLib.entityLibrary(entityTuple[0], 'entityName').strip()
        if name != '':
            lookupUrl = entityLib.entityLibrary(entityTuple[0], 'lookupUrl')
            text = wrapMatches(text, name, lookupUrl)

    # pass 2: first and last name
    if fullNames == False:
        #only do first and last name if the fullNames (meaning require full name) isn't set to true
        text = stashBlocks(text)

        for entityTuple in entityNameArray:
            if entityLib.entityLibrary(entityTuple[0], 'visibility') == 'invisible':
                continue
            lname = entityLib.entityLibrary(entityTuple[0], 'lname')
            if lname != None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityTuple[0], 'fname')
            if fname != None:
                fname = fname.strip()
            lookupUrl = entityLib.entityLibrary(entityTuple[0], 'lookupUrl')
            # don't use first and last name if fname or lname is a number.
            # each part is wrapped with its own text (the end-of-text
            # first-name match used to be replaced by the last name)
            for part in (fname, lname):
                if part != None and part != '' and not isNumber(part):
                    text = wrapMatches(text, part, lookupUrl)

    # pass 3: nicknames, after the rest are done
    text = stashBlocks(text)

    for entityTuple in entityNameArray:
        if entityLib.entityLibrary(entityTuple[0], 'visibility') == 'invisible':
            continue
        lookupUrl = entityLib.entityLibrary(entityTuple[0], 'lookupUrl')
        cid = str(entityTuple[0])
        nicknamesQ = mysql_tools.mysqlQuery("select name, case_sensitive from db_topics.nicknames where cid_1=" + cid + " or cid_2=" + cid + " or cid_3=" + cid, infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1,1)
            if nicknameRow == ():
                break
            # NOTE(review): re.I is applied when case_sensitive == 1,
            # which reads as inverted - confirm against the nicknames
            # table before changing; behaviour is kept as it was
            if nicknameRow[0]['case_sensitive'] == 1:
                nicknameMatch = re.search('\\b' + nicknameRow[0]['name'] + '\\b', text, re.I)
            else:
                nicknameMatch = re.search('\\b' + nicknameRow[0]['name'] + '\\b', text)
            if nicknameMatch != None:
                text = text.replace(nicknameMatch.group(0), "[celeb url=" + lookupUrl + "]" + nicknameMatch.group(0) + "[/celeb]")

            #take it out and store it off to prevent more dupes
            text = stashBlocks(text)

    # put the parked [celeb] blocks back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock == None:
            break
        text = text.replace(repBlock.group(0), tmpArray[int(repBlock.group(1))])

    # pull possessive / plural suffixes inside the closing tag
    text = text.replace("[/celeb]'s", "'s[/celeb]")
    text = text.replace("[/celeb]s", "s[/celeb]")

    # put the html back
    for i, block in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', block)

    return text
Example #13
0
def getRelevantEntity():
    ###################################################################
    ##func: getRelevantEntity
    ##param: none (takes entities from infoModule.info.entityList)
    ##desc: determines missing team given city(s) and athlete(s), by
    ##      summing the relevance scores of story city(s)/all teams with
    ##      story athlete(s)/all teams.
    ##ret: (team cid, summed relevance) tuple of the team with the
    ##     highest summed relevance, or False when the story already
    ##     names a team, has no city, has no athlete, or none of the
    ##     athletes has any team relevance row
    ##auth: mdk
    ##################################################################

    # build lists of entities
    cityIds = []
    athleteIds = []

    for cid in infoModule.info.entityList.keys():
        cidType = entityLib.entityLibrary(cid, "celeb_type")
        if cidType == "Team":
            # a team is already among the entities, so nothing is missing
            return False
        if cidType == "City":
            cityIds.append(cid)
        elif cidType == "Athlete":
            athleteIds.append(cid)

    if not cityIds or not athleteIds:
        return False

    log.plog("City cids: ")
    for cid in cityIds:
        log.plog(cid, 2)
    log.plog("Athlete cids: ")
    for cid in athleteIds:
        log.plog(cid, 2)

    # team cid => relevance summed over every athlete row (all rows are
    # consumed; the first row of each result set is no longer discarded)
    teamRelevance = {}
    for cid in athleteIds:
        for team, relevance in _teamRelevanceRows(cid):
            teamRelevance[team] = teamRelevance.get(team, 0) + relevance

    if not teamRelevance:
        # without athlete relevance there is nothing to rank
        return False

    # add every city's relevance on top of the athlete totals; within a
    # single city a repeated team keeps only its last row
    for cid in cityIds:
        cityRelevance = dict(_teamRelevanceRows(cid))
        for team, relevance in cityRelevance.items():
            teamRelevance[team] = teamRelevance.get(team, 0) + relevance

    # list of (team cid, relevance) pairs, highest relevance first
    sortedFinal = sorted(teamRelevance.items(), key=operator.itemgetter(1), reverse=True)
    log.plog("dict of team => relevance: ", 2)
    for team, relevance in sortedFinal:
        log.plog(str(team) + " => " + str(relevance))
    return sortedFinal[0]


def _teamRelevanceRows(cid):
    # Returns a list of (cid_2, relevance) int pairs, one per row relating
    # cid (as cid_1) to a celeb with mptype_id = 75.
    # NOTE(review): mptype_id 75 presumably selects teams -- confirm against
    # the db_topics schema.
    sql = (
        "SELECT cid_2, relevance FROM db_topics.celebs_related, db_topics.celebs WHERE celebs_related.cid_2=celebs.celeb_id and cid_1 = "
        + str(cid)  # cids may be ints; str() matches the other queries in this file
        + " AND mptype_id = 75"
    )
    relQ = mysql_tools.mysqlQuery(sql, infoModule.info.site["dblink"])
    rows = []
    while True:
        row = relQ.fetch_row(1, 1)
        if row == ():
            # empty tuple means the result set is exhausted
            break
        rows.append((int(row[0]["cid_2"]), int(row[0]["relevance"])))
    return rows