def _escapeJsonText(text):
    """Escape backslashes and double quotes so text is safe inside a JSON string."""
    return text.replace('\\', '\\\\').replace('"', '\\"')


def getEntities(searchText, title, jsonOut=True, byID=False):
    """Find the entities mentioned in a page and describe them as a JSON array.

    Stores searchText and title on infoModule.info.page, runs the entity and
    nickname finders over "title searchText", ranks the results with
    entities.setPrimo() and then serialises infoModule.info.entityList.

    param: searchText -- page outline/body text to scan
    param: title      -- page title, placed ahead of searchText when scanning
    param: jsonOut    -- when True (the default) build and return the JSON text
    param: byID       -- when true, each object's "id" is the entity id rather
                         than its lookupUrl
    ret:   the JSON array as a string; '' when there are no entities or every
           entity has celeb_type 'hidden'; None when jsonOut is not True (the
           finders and setPrimo have still been run)
    """
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title
    fullText = title + ' ' + searchText
    entities.entityFinder(fullText, True)
    entities.nicknameFinder(fullText, True, True)
    entities.setPrimo()

    # Same test as before: only True (or 1) selects JSON output.
    if jsonOut != True:
        return None

    entityList = infoModule.info.entityList
    iconUrl = 'http://informifi.com/fi_icon.png'

    # The JSON is assembled by hand to keep the exact layout consumers already
    # receive (field order, no whitespace); only the values are escaped.
    jsonObjects = []
    for ids in entityList.keys():
        entityName = entityLib.entityLibrary(ids, 'entityName')
        celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
        linkPath = entityLib.entityLibrary(ids, 'linkPath')
        entityURL = entityLib.entityLibrary(ids, 'lookupUrl')
        if byID:
            # swap URL for ID
            entityURL = str(ids)
        if celeb_type == 'hidden':
            continue

        obj = ('{"id":"' + _escapeJsonText(str(entityURL)) +
               '","name":"' + _escapeJsonText(str(entityName)) +
               '","primo":"' + entityList[ids]['primo'] +
               '","type":"' + _escapeJsonText(str(celeb_type)) + '",')
        if 'nameUsed' in entityList[ids]:
            # this means there's a nickname response
            obj += ('"nameUsed":"' +
                    _escapeJsonText(entityList[ids]['nameUsed']) + '",')
        link = _escapeJsonText(str(linkPath))
        obj += ('"links":[{"from":"celebrifi.com","url":"' + link +
                '","icon":"' + iconUrl +
                '"},{"from":"politifi.com","url":"' + link +
                '","icon":"' + iconUrl + '"}]}')
        jsonObjects.append(obj)

    # catch for nothing but hiddens (and for an empty entity list)
    if not jsonObjects:
        return ''
    return '[' + ','.join(jsonObjects) + ']'
def _teamRelevanceRows(cid):
    """Return [(related_id, relevance), ...] for the mptype_id 75 entities related to cid."""
    sql = 'SELECT cid_2, relevance FROM db_topics.celebs_related, db_topics.celebs WHERE celebs_related.cid_2=celebs.celeb_id and cid_1 = ' + str(cid) + ' AND mptype_id = 75'
    relQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    rows = []
    if relQ == False:
        # Other callers in this code base test the mysqlQuery result against
        # False to detect a failed query; treat that as "no related teams".
        return rows
    while True:
        rel = relQ.fetch_row(1, 1)
        if rel == ():
            break
        rows.append((int(rel[0]['cid_2']), int(rel[0]['relevance'])))
    return rows


def getTeamFromCity():
    """Guess the missing team of a story from its cities and athletes.

    Takes no parameters; the entities come from infoModule.info.entityList.
    For every City and every Athlete in the story the related mptype_id 75
    entities (presumably teams -- confirm against the mptype table) are looked
    up and their relevance scores are summed per team.

    ret: False when the story already contains a Team, has no City, has no
         Athlete, or no relevance rows were found; otherwise the
         (team_cid, summed_relevance) tuple with the highest sum.
    auth: mdk
    """
    cityIds = []
    athleteIds = []
    for cid in infoModule.info.entityList.keys():
        cidType = entityLib.entityLibrary(cid, 'celeb_type')
        if cidType == 'Team':
            # the story already names a team, nothing to infer
            return False
        if cidType == 'City':
            cityIds.append(cid)
        elif cidType == 'Athlete':
            athleteIds.append(cid)

    # exit if no cities or no athletes identified
    if not cityIds or not athleteIds:
        return False

    teamRelevance = {}
    for cid in cityIds:
        # dict() keeps one score per team for a single city (last row wins),
        # as the per-city dictionaries always did.
        for teamId, relevance in dict(_teamRelevanceRows(cid)).items():
            teamRelevance[teamId] = teamRelevance.get(teamId, 0) + relevance
    for cid in athleteIds:
        # every row counts, including the first one returned by the query
        for teamId, relevance in _teamRelevanceRows(cid):
            teamRelevance[teamId] = teamRelevance.get(teamId, 0) + relevance

    if not teamRelevance:
        return False

    ranked = sorted(teamRelevance.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    log.plog('picked team => ' + str(ranked[0]), 2)
    return ranked[0]
def _isTextualNamePart(part):
    """Return True when part is a non-empty name that is not just a number."""
    if part is None or part == '':
        return False
    try:
        int(part)
    except ValueError:
        return True
    return False


def _microdataItemprop(entityId, organizations):
    """Return the itemprop for an entity: 'affiliation', 'name' or None."""
    itemprop = None
    if entityLib.entityLibrary(entityId, 'human') == str(1):
        itemprop = 'name'
    # affiliation microdata: organisation mptypes override the person marking
    if int(entityLib.entityLibrary(entityId, 'mptype_id')) in organizations:
        itemprop = 'affiliation'
    return itemprop


def _wrapMicrodataName(text, name, opener, closer):
    """Wrap one mid-text, one leading and one trailing match of name in opener/closer."""
    pattern = re.escape(name)
    # mid-text: not directly after ']', '=' or '-', optionally followed by 's
    step1 = re.search(r"([^\]=-])\b" + pattern + r"('*s*)\b([^\[]){1}",
                      text, re.I)
    if step1 is not None:
        text = text.replace(
            step1.group(0),
            step1.group(1) + opener + name + closer + step1.group(2) +
            step1.group(3))
    # at the very start of the text (case-sensitive, as before)
    step2 = re.search("^" + pattern + r"('*s*)\b([^\[]){1}", text)
    if step2 is not None:
        text = text.replace(
            step2.group(0),
            opener + name + closer + step2.group(1) + step2.group(2))
    # at the very end of the text
    step3 = re.search(r'\b' + pattern + '$', text, re.I)
    if step3 is not None:
        text = text.replace(step3.group(0), opener + name + closer)
    return text


def _stashMicrodataBlocks(text, stash):
    """Swap each finished microdata span for a ~#~N~#~ placeholder indexing stash."""
    while True:
        block = re.search(r'<span itemscope.*?<\/span><\/span>', text)
        if block is None:
            break
        stash.append(block.group(0))
        text = text.replace(block.group(0),
                            "~#~" + str(len(stash) - 1) + "~#~")
    return text


def addMicrodata(text, entities, fullNames=False):
    """Wrap entity mentions in text with data-vocabulary.org microdata spans.

    Entities are processed from longest to shortest entityName. Three passes
    are made: full names, then first/last names (skipped when fullNames is
    not False), then nicknames from db_topics.nicknames. Existing HTML tags
    and already-wrapped mentions are replaced by placeholders during matching
    so nothing is marked up twice, and restored before returning.

    param: text      -- text (may contain HTML) to mark up
    param: entities  -- iterable of entity ids
    param: fullNames -- when not False, only full names and nicknames are used
    ret:   the marked-up text; text unchanged when entities is empty or text
           is None/''
    """
    # mptypes by category
    organizations = (37, 43, 44, 48, 49, 66, 76, 79, 104, 110, 120)
    itemscope = '<span itemscope itemtype="http://www.data-vocabulary.org/Person">'
    closer = '</span></span>'

    if len(entities) == 0:
        log.plog("no entities passed to addMicrodata", 5)
        return text
    if text is None or text == '':
        log.plog("no text passed to addMicrodata", 5)
        return text

    # entities are handled from longest name to shortest
    orderedIds = sorted(
        entities,
        key=lambda entityId: len(entityLib.entityLibrary(entityId, 'entityName')),
        reverse=True)
    visibleIds = [
        entityId for entityId in orderedIds
        if entityLib.entityLibrary(entityId, 'visibility') != 'invisible'
    ]

    # set aside all html so that matches are not made inside tags
    htmlBlocks = []
    while True:
        tag = re.search('<.*?>', text)
        if tag is None:
            break
        htmlBlocks.append(tag.group(0))
        text = text.replace(tag.group(0), "~*~%d~*~" % (len(htmlBlocks) - 1))

    # pass 1: full entity names
    for entityId in visibleIds:
        name = entityLib.entityLibrary(entityId, 'entityName').strip()
        if name == '':
            # an empty name would match at arbitrary word boundaries
            continue
        itemprop = _microdataItemprop(entityId, organizations)
        if itemprop is None:
            continue
        opener = itemscope + '<span itemprop="' + itemprop + '">'
        text = _wrapMicrodataName(text, name, opener, closer)

    # finished spans are parked in tmpArray between passes and put back at the end
    tmpArray = []

    # pass 2: first and last names, unless full names are required
    if fullNames == False:
        text = _stashMicrodataBlocks(text, tmpArray)
        for entityId in visibleIds:
            lname = entityLib.entityLibrary(entityId, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityId, 'fname')
            if fname is not None:
                fname = fname.strip()
            itemprop = _microdataItemprop(entityId, organizations)
            if itemprop is None:
                continue
            opener = itemscope + '<span itemprop="' + itemprop + '">'
            # don't use a first or last name that is missing or is a number
            for part in (fname, lname):
                if _isTextualNamePart(part):
                    text = _wrapMicrodataName(text, part, opener, closer)

    # pass 3: nicknames, after the rest are done
    text = _stashMicrodataBlocks(text, tmpArray)
    for entityId in visibleIds:
        itemprop = _microdataItemprop(entityId, organizations)
        if itemprop is None:
            continue
        opener = itemscope + '<span itemprop="' + itemprop + '">'
        nicknamesQ = mysql_tools.mysqlQuery(
            "select name, case_sensitive from db_topics.nicknames where cid_1="
            + str(entityId) + " or cid_2=" + str(entityId) + " or cid_3=" +
            str(entityId), infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1, 1)
            if nicknameRow == ():
                break
            # case_sensitive nicknames must match exactly; all others ignore
            # case. str() copes with the flag arriving as 1 or as '1'.
            if str(nicknameRow[0]['case_sensitive']) == '1':
                flags = 0
            else:
                flags = re.I
            nicknameMatch = re.search(
                r'\b' + re.escape(nicknameRow[0]['name']) + r'\b', text, flags)
            if nicknameMatch is not None:
                text = text.replace(
                    nicknameMatch.group(0),
                    opener + nicknameMatch.group(0) + closer)
                # take it out and store it off to prevent more dupes
                text = _stashMicrodataBlocks(text, tmpArray)

    # put the parked spans and then the original html back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock is None:
            break
        text = text.replace(repBlock.group(0),
                            tmpArray[int(repBlock.group(1))])
    for i, htmlBlock in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', htmlBlock)
    return text
def setPrimo():
    """Rank the entities of the current page and fill in their 'primo' slot.

    Takes no parameters. Reads infoModule.info.page ('title', 'outline'),
    infoModule.info.entityList and infoModule.info.source, and updates the
    'frequency' and 'primo' fields of the entityList entries in place.

    Slots are handed out in the order 'Y', '2', '3', '4':
      1. the entity associated with the source, if it is on the page;
      2. entities positioned inside the title, by position;
      3. entities mentioned more than once, by descending frequency;
      4. if nothing was assigned, the first visible entity by position.
    Invisible entities never receive a slot, and an entity that already has a
    slot is not given a second one.

    ret:  None
    auth: esr
    """
    page = infoModule.info.page
    if 'outline' not in page or page['outline'] == '':
        log.plog('no outline set before setPrimo called', 5)
        return
    if 'title' not in page or page['title'] == '':
        log.plog('no title set before setPrimo called', 5)
        return
    entityList = infoModule.info.entityList
    if len(entityList) == 0:
        log.plog('no entities set before setPrimo called', 5)
        return

    # highlight text as a way to count the frequency: every [celeb url=...]
    # tag is one mention of the entity with that lookupUrl
    highlightedText = highlightEntities.highlightEntitiesFromList(
        page['title'] + ' ' + page['outline'], entityList)
    while True:
        taggedEntity = re.search(r'\[celeb url=(.+?)\]', highlightedText)
        if taggedEntity is None:
            break
        highlightedText = highlightedText.replace(taggedEntity.group(0), '', 1)
        sql = "select celeb_id from db_topics.celebs where lookupUrl='" + taggedEntity.group(1) + "'"
        entityByLookupQ = mysql_tools.mysqlQuery(
            sql, infoModule.info.site['dblink'])
        while True:
            entityByLookup = entityByLookupQ.fetch_row(1, 1)
            if entityByLookup == ():
                break
            celebId = entityByLookup[0]['celeb_id']
            # the lookup may return an entity that is not on this page
            if celebId in entityList:
                entityList[celebId]['frequency'] += 1

    # now frequency is set; slots go to position first, then frequency
    primoTypes = ['Y', '2', '3', '4', 'N']
    primoTypeCursor = 0

    # if source is associated with an entity on the page, it's auto-set to primo
    source = infoModule.info.source
    if 'celeb_id' in source and source['celeb_id'] != '' and int(
            source['celeb_id']) > 0:
        if source['celeb_id'] in entityList:
            entityList[source['celeb_id']]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    posArray = []
    freqArray = []
    for eKey in entityList.keys():
        if entityList[eKey]['position'] is not None:
            posArray.append([eKey, entityList[eKey]['position']])
        frequency = entityList[eKey]['frequency']
        if frequency is not None and frequency > 1:
            freqArray.append([eKey, frequency])
    posArray.sort(key=lambda x: x[1])
    freqArray.sort(key=lambda x: x[1], reverse=True)

    titleLen = len(page['title'])

    # primo set by position in title
    ctr = 0
    while primoTypeCursor < 4 and ctr < len(posArray):
        eKey, position = posArray[ctr]
        if position > titleLen:
            break
        # only non-hidden entities without a slot can be primo; skipping
        # assigned ones keeps the source entity's slot from being overwritten
        if entityList[eKey]['primo'] == 'N' and entityLib.entityLibrary(
                eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s found in title, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1
        ctr += 1

    # primo set by frequency
    ctr = 0
    while primoTypeCursor < 4 and ctr < len(freqArray):
        eKey = freqArray[ctr][0]
        if entityList[eKey]['primo'] == 'N' and entityLib.entityLibrary(
                eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s has high frequency, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1
        ctr += 1

    # if nothing found, go to first position and set it to primo
    if primoTypeCursor == 0:
        for eKey, _position in posArray:
            # only non-hidden entities can be primo
            if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
                log.plog(
                    "entity %s found first in story, setting to primo %s" %
                    (eKey, primoTypes[primoTypeCursor]), 3)
                entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
                primoTypeCursor += 1
                break
link = mysql_tools.mysqlConnect() if link == False : print "no connection" sys.exit(0) infoModule.info.site['dblink'] = link infoModule.info.site['log_priority_threshold'] = 3 if len(sys.argv) > 1 and int(sys.argv[1]) > 0: #get specific sub_id print "looking up entity_id: " + sys.argv[1] cid = sys.argv[1] print "%s:" % cid print "\t%s" % entityLib.entityLibrary(int(cid), 'entityName') print "\t%s" % entityLib.entityLibrary(int(cid), 'lookupUrl') print "\t%s" % entityLib.entityLibrary(int(cid), 'bio') else: sql = "select celeb_id from db_topics.celebs order by rand() limit 20" er = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink']) if er == False: print "select random entities failed" sys.exit(0) while (1): row=er.fetch_row(1,1)
def setPrimo():
    """Rank the entities of the current page and fill in their 'primo' slot.

    Takes no parameters. Reads infoModule.info.page ('title', 'outline'),
    infoModule.info.entityList and infoModule.info.source, and updates the
    'frequency' and 'primo' fields of the entityList entries in place.

    Slots are handed out in the order 'Y', '2', '3', '4':
      1. the entity associated with the source, if it is on the page;
      2. entities positioned inside the title, by position;
      3. entities mentioned more than once, by descending frequency;
      4. if nothing was assigned, the first visible entity by position.
    Invisible entities never receive a slot, and an entity that already has a
    slot is not given a second one.

    ret:  None
    auth: esr
    """
    page = infoModule.info.page
    if 'outline' not in page or page['outline'] == '':
        log.plog('no outline set before setPrimo called', 5)
        return
    if 'title' not in page or page['title'] == '':
        log.plog('no title set before setPrimo called', 5)
        return
    entityList = infoModule.info.entityList
    if len(entityList) == 0:
        log.plog('no entities set before setPrimo called', 5)
        return

    # highlight text as a way to count the frequency: every [celeb url=...]
    # tag is one mention of the entity with that lookupUrl
    highlightedText = highlightEntities.highlightEntitiesFromList(
        page['title'] + ' ' + page['outline'], entityList)
    while True:
        taggedEntity = re.search(r'\[celeb url=(.+?)\]', highlightedText)
        if taggedEntity is None:
            break
        highlightedText = highlightedText.replace(taggedEntity.group(0), '', 1)
        sql = "select celeb_id from db_topics.celebs where lookupUrl='" + taggedEntity.group(1) + "'"
        entityByLookupQ = mysql_tools.mysqlQuery(
            sql, infoModule.info.site['dblink'])
        while True:
            entityByLookup = entityByLookupQ.fetch_row(1, 1)
            if entityByLookup == ():
                break
            celebId = entityByLookup[0]['celeb_id']
            # the lookup may return an entity that is not on this page
            if celebId in entityList:
                entityList[celebId]['frequency'] += 1

    # now frequency is set; slots go to position first, then frequency
    primoTypes = ['Y', '2', '3', '4', 'N']
    primoTypeCursor = 0

    # if source is associated with an entity on the page, it's auto-set to primo
    source = infoModule.info.source
    if 'celeb_id' in source and source['celeb_id'] != '' and int(
            source['celeb_id']) > 0:
        if source['celeb_id'] in entityList:
            entityList[source['celeb_id']]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1

    posArray = []
    freqArray = []
    for eKey in entityList.keys():
        if entityList[eKey]['position'] is not None:
            posArray.append([eKey, entityList[eKey]['position']])
        frequency = entityList[eKey]['frequency']
        if frequency is not None and frequency > 1:
            freqArray.append([eKey, frequency])
    posArray.sort(key=lambda x: x[1])
    freqArray.sort(key=lambda x: x[1], reverse=True)

    titleLen = len(page['title'])

    # primo set by position in title
    ctr = 0
    while primoTypeCursor < 4 and ctr < len(posArray):
        eKey, position = posArray[ctr]
        if position > titleLen:
            break
        # only non-hidden entities without a slot can be primo; skipping
        # assigned ones keeps the source entity's slot from being overwritten
        if entityList[eKey]['primo'] == 'N' and entityLib.entityLibrary(
                eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s found in title, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1
        ctr += 1

    # primo set by frequency
    ctr = 0
    while primoTypeCursor < 4 and ctr < len(freqArray):
        eKey = freqArray[ctr][0]
        if entityList[eKey]['primo'] == 'N' and entityLib.entityLibrary(
                eKey, 'visibility') != 'invisible':
            log.plog(
                "entity %s has high frequency, setting to primo %s" %
                (eKey, primoTypes[primoTypeCursor]), 3)
            entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
            primoTypeCursor += 1
        ctr += 1

    # if nothing found, go to first position and set it to primo
    if primoTypeCursor == 0:
        for eKey, _position in posArray:
            # only non-hidden entities can be primo
            if entityLib.entityLibrary(eKey, 'visibility') != 'invisible':
                log.plog(
                    "entity %s found first in story, setting to primo %s" %
                    (eKey, primoTypes[primoTypeCursor]), 3)
                entityList[eKey]['primo'] = primoTypes[primoTypeCursor]
                primoTypeCursor += 1
                break
def _isTaggableNamePart(part):
    """Return True when part is a non-empty name that is not just a number."""
    if part is None or part == '':
        return False
    try:
        int(part)
    except ValueError:
        return True
    return False


def _wrapCelebName(text, name, opener):
    """Wrap one mid-text, one leading and one trailing match of name in opener ... [/celeb]."""
    closer = "[/celeb]"
    pattern = re.escape(name)
    # mid-text: not directly after ']', '=' or '-', optionally followed by 's
    step1 = re.search(r"([^\]=-])\b" + pattern + r"('*s*)\b([^\[]){1}",
                      text, re.I)
    if step1 is not None:
        text = text.replace(
            step1.group(0),
            step1.group(1) + opener + name + closer + step1.group(2) +
            step1.group(3))
    # at the very start of the text (case-sensitive, as before)
    step2 = re.search("^" + pattern + r"('*s*)\b([^\[]){1}", text)
    if step2 is not None:
        text = text.replace(
            step2.group(0),
            opener + name + closer + step2.group(1) + step2.group(2))
    # at the very end of the text
    step3 = re.search(r'\b' + pattern + '$', text, re.I)
    if step3 is not None:
        text = text.replace(step3.group(0), opener + name + closer)
    return text


def _stashCelebBlocks(text, stash):
    """Swap each finished [celeb ...][/celeb] block for a ~#~N~#~ placeholder indexing stash."""
    while True:
        block = re.search(r'\[celeb url.*?\[\/celeb\]', text)
        if block is None:
            break
        stash.append(block.group(0))
        text = text.replace(block.group(0),
                            "~#~" + str(len(stash) - 1) + "~#~")
    return text


def highlightEntitiesFromList(text, entities, fullNames=False):
    """Wrap every mention of the given entities in [celeb url=...][/celeb] tags.

    Takes a list of entities (id numbers), finds name matches, first/last
    name matches and nickname matches, and wraps them in the text with
    [celeb url=<lookupUrl>]...[/celeb]. Entities are processed from longest
    to shortest entityName. HTML tags and already-tagged mentions are swapped
    for placeholders while matching and restored before returning. Its
    current usage is to mark up the outline of a page and to be a tool for
    determining the frequency with which entities appear.

    param: text      -- text (may contain HTML) to mark up
    param: entities  -- iterable of entity ids
    param: fullNames -- when not False, first/last names alone are not tagged
    ret:   string; text unchanged when entities is empty or text is None/''
    auth:  esr
    """
    if len(entities) == 0:
        log.plog("no entities passed to highlightEntitiesFromList", 5)
        return text
    if text is None or text == '':
        log.plog("no text passed to highlightEntitiesFromList", 5)
        return text

    # entities are handled from longest name to shortest
    orderedIds = sorted(
        entities,
        key=lambda entityId: len(entityLib.entityLibrary(entityId, 'entityName')),
        reverse=True)
    visibleIds = [
        entityId for entityId in orderedIds
        if entityLib.entityLibrary(entityId, 'visibility') != 'invisible'
    ]

    # set aside all html so that celeb matches are not made inside tags
    htmlBlocks = []
    while True:
        tag = re.search('<.*?>', text)
        if tag is None:
            break
        htmlBlocks.append(tag.group(0))
        text = text.replace(tag.group(0), "~*~%d~*~" % (len(htmlBlocks) - 1))

    # pass 1: full entity names
    for entityId in visibleIds:
        name = entityLib.entityLibrary(entityId, 'entityName').strip()
        if name != '':
            lookupUrl = entityLib.entityLibrary(entityId, 'lookupUrl')
            text = _wrapCelebName(text, name, "[celeb url=" + lookupUrl + "]")

    # finished blocks are parked in tmpArray between passes and put back at the end
    tmpArray = []

    # pass 2: first and last names, unless full names are required
    if fullNames == False:
        text = _stashCelebBlocks(text, tmpArray)
        for entityId in visibleIds:
            lname = entityLib.entityLibrary(entityId, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(entityId, 'fname')
            if fname is not None:
                fname = fname.strip()
            opener = "[celeb url=" + entityLib.entityLibrary(
                entityId, 'lookupUrl') + "]"
            # don't use a first or last name that is missing or is a number;
            # each part is wrapped as itself (a trailing first name is no
            # longer replaced by the last name)
            for part in (fname, lname):
                if _isTaggableNamePart(part):
                    text = _wrapCelebName(text, part, opener)

    # pass 3: nicknames, after the rest are done
    text = _stashCelebBlocks(text, tmpArray)
    for entityId in visibleIds:
        lookupUrl = entityLib.entityLibrary(entityId, 'lookupUrl')
        nicknamesQ = mysql_tools.mysqlQuery(
            "select name, case_sensitive from db_topics.nicknames where cid_1="
            + str(entityId) + " or cid_2=" + str(entityId) + " or cid_3=" +
            str(entityId), infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1, 1)
            if nicknameRow == ():
                break
            # case_sensitive nicknames must match exactly; all others ignore
            # case. str() copes with the flag arriving as 1 or as '1'.
            if str(nicknameRow[0]['case_sensitive']) == '1':
                flags = 0
            else:
                flags = re.I
            nicknameMatch = re.search(
                r'\b' + re.escape(nicknameRow[0]['name']) + r'\b', text, flags)
            if nicknameMatch is not None:
                text = text.replace(
                    nicknameMatch.group(0), "[celeb url=" + lookupUrl + "]" +
                    nicknameMatch.group(0) + "[/celeb]")
                # take it out and store it off to prevent more dupes
                text = _stashCelebBlocks(text, tmpArray)

    # put the parked blocks back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock is None:
            break
        text = text.replace(repBlock.group(0),
                            tmpArray[int(repBlock.group(1))])

    # pull possessive/plural suffixes inside the tag
    text = text.replace("[/celeb]'s", "'s[/celeb]")
    text = text.replace("[/celeb]s", "s[/celeb]")

    # put the original html back
    for i, htmlBlock in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', htmlBlock)
    return text
def getIds(sub_id):
    """Seed infoModule.info.entityList with the entities attached to a story.

    Selects every celeb_id recorded for sub_id in the site database's
    subs_celebs table and registers it in infoModule.info.entityList with no
    position, a frequency of 0 and primo 'N'.

    param: sub_id -- id of the story whose entities are loaded
    ret:   None
    """
    sql = 'SELECT celeb_id FROM ' + infoModule.info.site[
        'database'] + '.subs_celebs WHERE sub_id = ' + str(sub_id)
    entityIdsQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    if entityIdsQ == False:
        # Other callers in this code base test the mysqlQuery result against
        # False to detect a failed query.
        print("select entities for sub_id " + str(sub_id) + " failed")
        return
    # fetch_row(0, 1): all rows at once, each row as a dict
    for row in entityIdsQ.fetch_row(0, 1):
        infoModule.info.entityList[row['celeb_id']] = {
            'position': None,
            'frequency': 0,
            'primo': 'N'
        }


# Ad-hoc test run, executed whenever this module is loaded.
if (1):
    # http://dev.sportifi.com/news/Carmelo-about-to-put-fans-in-an-awkward-position-52139.html
    getIds(52139)
    # delete team for testing purposes
    del infoModule.info.entityList['853957']
    for cid in infoModule.info.entityList.keys():
        cidType = entityLib.entityLibrary(cid, 'celeb_type')
        print(str(cid) + " - " + cidType)
    # NOTE(review): getRelevantEntity is not defined in the visible part of
    # this file -- confirm it is provided elsewhere.
    result = getRelevantEntity()
    print(result)
def _escapeJsonText(text):
    """Escape backslashes and double quotes so text is safe inside a JSON string."""
    return text.replace('\\', '\\\\').replace('"', '\\"')


def getEntities(searchText, title, jsonOut=True, byID=False):
    """Find the entities mentioned in a page and describe them as a JSON array.

    Stores searchText and title on infoModule.info.page, runs the entity and
    nickname finders over "title searchText", ranks the results with
    entities.setPrimo() and then serialises infoModule.info.entityList.

    param: searchText -- page outline/body text to scan
    param: title      -- page title, placed ahead of searchText when scanning
    param: jsonOut    -- when True (the default) build and return the JSON text
    param: byID       -- when true, each object's "id" is the entity id rather
                         than its lookupUrl
    ret:   the JSON array as a string; '' when there are no entities or every
           entity has celeb_type 'hidden'; None when jsonOut is not True (the
           finders and setPrimo have still been run)
    """
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title
    fullText = title + ' ' + searchText
    entities.entityFinder(fullText, True)
    entities.nicknameFinder(fullText, True, True)
    entities.setPrimo()

    # Same test as before: only True (or 1) selects JSON output.
    if jsonOut != True:
        return None

    entityList = infoModule.info.entityList
    iconUrl = 'http://informifi.com/fi_icon.png'

    # The JSON is assembled by hand to keep the exact layout consumers already
    # receive (field order, no whitespace); only the values are escaped.
    jsonObjects = []
    for ids in entityList.keys():
        entityName = entityLib.entityLibrary(ids, 'entityName')
        celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
        linkPath = entityLib.entityLibrary(ids, 'linkPath')
        entityURL = entityLib.entityLibrary(ids, 'lookupUrl')
        if byID:
            # swap URL for ID
            entityURL = str(ids)
        if celeb_type == 'hidden':
            continue

        obj = ('{"id":"' + _escapeJsonText(str(entityURL)) +
               '","name":"' + _escapeJsonText(str(entityName)) +
               '","primo":"' + entityList[ids]['primo'] +
               '","type":"' + _escapeJsonText(str(celeb_type)) + '",')
        if 'nameUsed' in entityList[ids]:
            # this means there's a nickname response
            obj += ('"nameUsed":"' +
                    _escapeJsonText(entityList[ids]['nameUsed']) + '",')
        link = _escapeJsonText(str(linkPath))
        obj += ('"links":[{"from":"celebrifi.com","url":"' + link +
                '","icon":"' + iconUrl +
                '"},{"from":"politifi.com","url":"' + link +
                '","icon":"' + iconUrl + '"}]}')
        jsonObjects.append(obj)

    # catch for nothing but hiddens (and for an empty entity list)
    if not jsonObjects:
        return ''
    return '[' + ','.join(jsonObjects) + ']'
def _microdataUsableNamePart(part):
    ##desc: True when a first/last name is present and is not a bare number
    if part is None or part == '':
        return False
    try:
        int(part)
    except ValueError:
        return True
    # don't use first and last name if fname or lname is a number
    return False


def _microdataItemprop(cid, organizations):
    ##desc: 'affiliation' for organisation mptypes, 'name' for humans, else None
    if int(entityLib.entityLibrary(cid, 'mptype_id')) in organizations:
        return 'affiliation'
    if entityLib.entityLibrary(cid, 'human') == str(1):
        return 'name'
    return None


def _microdataStash(text, store):
    ##desc: moves every finished microdata span out of text into store,
    ##      leaving ~#~index~#~ behind so later passes cannot match inside it
    while True:
        block = re.search(r'<span itemscope.*?<\/span><\/span>', text)
        if block is None:
            break
        store.append(block.group(0))
        text = text.replace(block.group(0), "~#~" + str(len(store) - 1) + "~#~")
    return text


def _microdataWrapName(text, name, opening, closing):
    ##desc: wraps name in opening/closing where it is found mid-text, at the
    ##      start of the text, or at the very end of the text
    # names are data, not patterns: escape them so "Ke$ha" or "will.i.am"
    # match literally instead of being read as regex syntax
    pattern = re.escape(name)
    tagged = opening + name + closing
    step1 = re.search(r"([^\]=-])\b" + pattern + r"('*s*)\b([^\[]){1}", text, re.I)
    if step1 is not None:
        text = text.replace(step1.group(0),
                            step1.group(1) + tagged + step1.group(2) + step1.group(3))
    step2 = re.search("^" + pattern + r"('*s*)\b([^\[]){1}", text)
    if step2 is not None:
        text = text.replace(step2.group(0),
                            tagged + step2.group(1) + step2.group(2))
    step3 = re.search(r"\b" + pattern + "$", text, re.I)
    if step3 is not None:
        text = text.replace(step3.group(0), tagged)
    return text


def addMicrodata(text, entities, fullNames = False):
    ###################################################################
    ##func: addMicrodata
    ##param: text - text to mark up
    ##       entities - iterable of entity ids
    ##       fullNames - when not False, only full entity names (and
    ##                   nicknames) are matched, not first/last names
    ##desc: wraps entity names, first/last names and nicknames found in
    ##      text with data-vocabulary.org Person microdata spans. Existing
    ##      html tags are set aside first so no match is made inside them.
    ##ret: string (text is returned unchanged when text or entities is empty)
    ##################################################################
    # mptypes by category
    organizations = [37,43,44,48,49,66,76,79,104,110,120]
    closing = '</span></span>'

    def opening(itemprop):
        return ('<span itemscope itemtype="http://www.data-vocabulary.org/Person">'
                + '<span itemprop="' + itemprop + '">')

    if len(entities) == 0:
        log.plog("no entities passed to addMicrodata", 5)
        return text
    if text is None or text == '':
        log.plog("no text passed to addMicrodata", 5)
        return text

    # celebList should be sorted from long to short
    orderedIds = sorted(entities,
                        key=lambda cid: len(entityLib.entityLibrary(cid, 'entityName')),
                        reverse=True)

    # set aside all html so that celeb matches are not made inside
    htmlBlocks = []
    while True:
        tag = re.search('<.*?>', text)
        if tag is None:
            break
        text = text.replace(tag.group(0), "~*~%d~*~" % len(htmlBlocks))
        htmlBlocks.append(tag.group(0))

    # pass 1: full entity names
    for cid in orderedIds:
        if entityLib.entityLibrary(cid, 'visibility') == 'invisible':
            continue
        name = entityLib.entityLibrary(cid, 'entityName').strip()
        if name == '':
            # an empty name would turn the patterns below into
            # match-anything expressions
            continue
        itemprop = _microdataItemprop(cid, organizations)
        if itemprop is None:
            continue
        text = _microdataWrapName(text, name, opening(itemprop), closing)

    # pass 2: first and last names, only when full names are not required.
    # finished spans are swapped for ~#~num~#~ markers between passes; num
    # indexes tmpArray so the spans can be put back at the end.
    tmpArray = []
    if fullNames == False:
        text = _microdataStash(text, tmpArray)
        for cid in orderedIds:
            if entityLib.entityLibrary(cid, 'visibility') == 'invisible':
                continue
            lname = entityLib.entityLibrary(cid, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(cid, 'fname')
            if fname is not None:
                fname = fname.strip()
            itemprop = _microdataItemprop(cid, organizations)
            if itemprop is None:
                continue
            for part in (fname, lname):
                # a missing (None) part used to raise TypeError from int()
                if _microdataUsableNamePart(part):
                    text = _microdataWrapName(text, part, opening(itemprop), closing)

    # pass 3: nicknames, after the rest are done
    text = _microdataStash(text, tmpArray)
    for cid in orderedIds:
        if entityLib.entityLibrary(cid, 'visibility') == 'invisible':
            continue
        itemprop = _microdataItemprop(cid, organizations)
        if itemprop is None:
            # nothing would be wrapped, so skip the nickname lookup entirely
            continue
        nicknamesQ = mysql_tools.mysqlQuery(
            "select name, case_sensitive from db_topics.nicknames where cid_1="
            + str(cid) + " or cid_2=" + str(cid) + " or cid_3=" + str(cid),
            infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1,1)
            if nicknameRow == ():
                break
            # NOTE(review): rows flagged case_sensitive are the ones searched
            # with re.I, which looks inverted; left as-is because the type of
            # the column value (int vs str) cannot be confirmed from here.
            # Nickname text is used as a pattern unescaped - confirm whether
            # the table is meant to hold regexes.
            if nicknameRow[0]['case_sensitive'] == 1:
                nicknameMatch = re.search(r'\b' + nicknameRow[0]['name'] + r'\b', text, re.I)
            else:
                nicknameMatch = re.search(r'\b' + nicknameRow[0]['name'] + r'\b', text)
            if nicknameMatch is not None:
                text = text.replace(nicknameMatch.group(0),
                                    opening(itemprop) + nicknameMatch.group(0) + closing)
                # take it out and store it off to prevent more dupes
                text = _microdataStash(text, tmpArray)

    # put the stored spans back, then the original html
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock is None:
            break
        text = text.replace(repBlock.group(0), tmpArray[int(repBlock.group(1))])
    for i, block in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', block)
    return text
sys.exit(0) # set globals for site infoModule.info.site['database'] = 'db_sportifi' infoModule.info.site['dblink'] = link def getIds(sub_id): sql = 'SELECT celeb_id FROM ' + infoModule.info.site['database'] + '.subs_celebs WHERE sub_id = ' + str(sub_id) entityIdsQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink']) entityIds = entityIdsQ.fetch_row(0,1) entityRows = [] for row in entityIds: infoModule.info.entityList[row['celeb_id']] = {'position': None, 'frequency': 0, 'primo' : 'N'} if (1): # http://dev.sportifi.com/news/Carmelo-about-to-put-fans-in-an-awkward-position-52139.html getIds(52139) # delete team for testing purposes del infoModule.info.entityList['853957'] for cid in infoModule.info.entityList.keys(): cidType = entityLib.entityLibrary(cid,'celeb_type') print str(cid) + " - " + cidType result = getRelevantEntity() print result
def _celebUsableNamePart(part):
    ##desc: True when a first/last name is present and is not a bare number
    if part is None or part == '':
        return False
    try:
        int(part)
    except ValueError:
        return True
    # don't use first and last name if fname or lname is a number
    return False


def _celebStash(text, store):
    ##desc: moves every finished [celeb ...][/celeb] block out of text into
    ##      store, leaving ~#~index~#~ behind so later passes cannot match
    ##      inside it
    while True:
        block = re.search(r'\[celeb url.*?\[\/celeb\]', text)
        if block is None:
            break
        store.append(block.group(0))
        text = text.replace(block.group(0), "~#~" + str(len(store) - 1) + "~#~")
    return text


def _celebWrapName(text, name, lookupUrl):
    ##desc: wraps name in [celeb url=lookupUrl]...[/celeb] where it is found
    ##      mid-text, at the start of the text, or at the very end of the text
    # names are data, not patterns: escape them so "Ke$ha" or "will.i.am"
    # match literally instead of being read as regex syntax
    pattern = re.escape(name)
    tagged = "[celeb url=" + lookupUrl + "]" + name + "[/celeb]"
    step1 = re.search(r"([^\]=-])\b" + pattern + r"('*s*)\b([^\[]){1}", text, re.I)
    if step1 is not None:
        text = text.replace(step1.group(0),
                            step1.group(1) + tagged + step1.group(2) + step1.group(3))
    step2 = re.search("^" + pattern + r"('*s*)\b([^\[]){1}", text)
    if step2 is not None:
        text = text.replace(step2.group(0),
                            tagged + step2.group(1) + step2.group(2))
    step3 = re.search(r"\b" + pattern + "$", text, re.I)
    if step3 is not None:
        text = text.replace(step3.group(0), tagged)
    return text


def highlightEntitiesFromList(text, entities, fullNames = False):
    ###################################################################
    ##func: highlightEntitiesFromList
    ##param: text - text to mark up
    ##       entities - iterable of entity ids
    ##       fullNames - when not False, only full entity names (and
    ##                   nicknames) are matched, not first/last names
    ##desc: takes a list of entities (id numbers) and finds all name matches
    ##      and nickname matches and wraps those in the text with
    ##      [celeb url="celeb-url"][/celeb]
    ##      in the source_reader context, it will probably be phased out
    ##      its current usage is to mark up the outline of a page and to
    ##      be a tool for determining the frequency with which entities appear
    ##ret: string (text is returned unchanged when text or entities is empty)
    ##auth: esr
    ##################################################################
    if len(entities) == 0:
        log.plog("no entities passed to highlightEntitiesFromList", 5)
        return text
    if text is None or text == '':
        log.plog("no text passed to highlightEntitiesFromList", 5)
        return text

    # celebList should be sorted from long to short
    orderedIds = sorted(entities,
                        key=lambda cid: len(entityLib.entityLibrary(cid, 'entityName')),
                        reverse=True)

    # set aside all html so that celeb matches are not made inside
    htmlBlocks = []
    while True:
        tag = re.search('<.*?>', text)
        if tag is None:
            break
        text = text.replace(tag.group(0), "~*~%d~*~" % len(htmlBlocks))
        htmlBlocks.append(tag.group(0))

    # pass 1: full entity names
    for cid in orderedIds:
        if entityLib.entityLibrary(cid, 'visibility') == 'invisible':
            continue
        name = entityLib.entityLibrary(cid, 'entityName').strip()
        if name == '':
            continue
        lookupUrl = entityLib.entityLibrary(cid, 'lookupUrl')
        text = _celebWrapName(text, name, lookupUrl)

    # pass 2: first and last names, only when full names are not required.
    # finished blocks are swapped for ~#~num~#~ markers between passes; num
    # indexes tmpArray so the blocks can be put back at the end.
    tmpArray = []
    if fullNames == False:
        text = _celebStash(text, tmpArray)
        for cid in orderedIds:
            if entityLib.entityLibrary(cid, 'visibility') == 'invisible':
                continue
            lname = entityLib.entityLibrary(cid, 'lname')
            if lname is not None:
                lname = lname.strip()
            fname = entityLib.entityLibrary(cid, 'fname')
            if fname is not None:
                fname = fname.strip()
            lookupUrl = entityLib.entityLibrary(cid, 'lookupUrl')
            for part in (fname, lname):
                # a missing (None) part used to raise TypeError from int();
                # a first name matched at the end of the text used to be
                # replaced with the last name - each part now wraps itself
                if _celebUsableNamePart(part):
                    text = _celebWrapName(text, part, lookupUrl)

    # pass 3: nicknames, after the rest are done
    text = _celebStash(text, tmpArray)
    for cid in orderedIds:
        if entityLib.entityLibrary(cid, 'visibility') == 'invisible':
            continue
        lookupUrl = entityLib.entityLibrary(cid, 'lookupUrl')
        nicknamesQ = mysql_tools.mysqlQuery(
            "select name, case_sensitive from db_topics.nicknames where cid_1="
            + str(cid) + " or cid_2=" + str(cid) + " or cid_3=" + str(cid),
            infoModule.info.site['dblink'])
        while True:
            nicknameRow = nicknamesQ.fetch_row(1,1)
            if nicknameRow == ():
                break
            # NOTE(review): rows flagged case_sensitive are the ones searched
            # with re.I, which looks inverted; left as-is because the type of
            # the column value (int vs str) cannot be confirmed from here.
            # Nickname text is used as a pattern unescaped - confirm whether
            # the table is meant to hold regexes.
            if nicknameRow[0]['case_sensitive'] == 1:
                nicknameMatch = re.search(r'\b' + nicknameRow[0]['name'] + r'\b', text, re.I)
            else:
                nicknameMatch = re.search(r'\b' + nicknameRow[0]['name'] + r'\b', text)
            if nicknameMatch is not None:
                text = text.replace(nicknameMatch.group(0),
                                    "[celeb url=" + lookupUrl + "]" + nicknameMatch.group(0) + "[/celeb]")
                # take it out and store it off to prevent more dupes
                text = _celebStash(text, tmpArray)

    # put the stored blocks back
    while True:
        repBlock = re.search(r'~#~(\d+)~#~', text)
        if repBlock is None:
            break
        text = text.replace(repBlock.group(0), tmpArray[int(repBlock.group(1))])

    # pull a trailing possessive/plural inside the celeb block
    text = text.replace("[/celeb]'s", "'s[/celeb]")
    text = text.replace("[/celeb]s", "s[/celeb]")

    # put the original html back
    for i, block in enumerate(htmlBlocks):
        text = text.replace('~*~' + str(i) + '~*~', block)
    return text
def _relatedTeamScores(cid):
    ##desc: list of (related cid, relevance) int pairs for every
    ##      celebs_related row of cid whose target has mptype_id 75
    ##      (presumably the team mptype - confirm)
    sql = (
        "SELECT cid_2, relevance FROM db_topics.celebs_related, db_topics.celebs WHERE celebs_related.cid_2=celebs.celeb_id and cid_1 = "
        + str(cid)
        + " AND mptype_id = 75"
    )
    relatedQ = mysql_tools.mysqlQuery(sql, infoModule.info.site["dblink"])
    scores = []
    while True:
        row = relatedQ.fetch_row(1, 1)
        if row == ():
            break
        scores.append((int(row[0]["cid_2"]), int(row[0]["relevance"])))
    return scores


def getRelevantEntity():
    ###################################################################
    ##func: getRelevantEntity
    ##param: none (takes entities from infoModule.info.entityList)
    ##desc: determines missing team given city(s) and athlete(s), by
    ##      summing the relevance scores of story city(s)/all teams with
    ##      story athlete(s)/all teams.
    ##ret: False when the story already has a team, has no city, has no
    ##     athlete, or no related team was found; otherwise the
    ##     (team cid, summed relevance) pair with the highest relevance
    ##     NOTE(review): the original header promised only the cid, but the
    ##     code returned the first element of the sorted item list, i.e. the
    ##     pair; the pair is kept - confirm what callers expect
    ##auth: mdk
    ##################################################################
    # build lists of entities
    cityIds = []
    athleteIds = []
    for cid in infoModule.info.entityList.keys():
        cidType = entityLib.entityLibrary(cid, "celeb_type")
        if cidType == "Team":
            # the story already names a team, nothing to infer
            return False
        if cidType == "City":
            cityIds.append(cid)
        elif cidType == "Athlete":
            athleteIds.append(cid)

    if len(cityIds) == 0:
        return False
    if len(athleteIds) == 0:
        return False

    log.plog("City cids: ")
    for cityId in cityIds:
        log.plog(cityId, 2)
    log.plog("Athlete cids: ")
    for athleteId in athleteIds:
        log.plog(athleteId, 2)

    # one master dict of team => relevance, summed over every city and
    # every athlete. This replaces the per-city dicts and the merge loop,
    # which never advanced its counter and so never terminated; it also no
    # longer throws away the first row of each athlete's result set.
    teamScores = {}
    for cid in cityIds + athleteIds:
        for teamId, relevance in _relatedTeamScores(cid):
            teamScores[teamId] = teamScores.get(teamId, 0) + relevance

    if not teamScores:
        return False

    # sorted() returns a list of (team, relevance) pairs, best first
    sortedFinalDict = sorted(teamScores.items(), key=operator.itemgetter(1), reverse=True)
    log.plog("dict of team => relevance: ", 2)
    for teamId, relevance in sortedFinalDict:
        log.plog(str(teamId) + " => " + str(relevance))
    return sortedFinalDict[0]