def findEntityCategory(entFile, categoryFile, instanceFile):
    """Print every tagged entry of *entFile* enriched with category/type info.

    Each non-blank line of *entFile* is lower-cased and parsed with
    ``ast.literal_eval``; the parsed value is indexed with ``'spots'`` and
    every spot is indexed with ``'wikiname'``.  For each spot the escaped
    wiki name is looked up in the two tables: the ``'cat'`` key receives the
    category-table value and the ``'type'`` key the instance-table value, or
    an empty list (plus a message on stdout) when the name is absent.  The
    enriched entry is then printed.

    Args:
        entFile: path of the file holding one Python literal per line.
        categoryFile: path handed to ``loadCategories``.
        instanceFile: path handed to ``loadInstancesInList``.

    Raises:
        ValueError, SyntaxError: from ``ast.literal_eval`` on a malformed line.
        KeyError: when an entry lacks ``'spots'`` or a spot lacks
            ``'wikiname'``.
    """
    categoryList = loadCategories(categoryFile)
    instanceList = loadInstancesInList(instanceFile)

    # The context manager closes the input file even if a line fails to parse.
    with open(entFile, 'r') as entries:
        for line in entries:
            # A blank line is not a literal; literal_eval would raise on it.
            if not line.strip():
                continue

            spotDict = ast.literal_eval(line.lower())
            for spot in spotDict['spots']:
                # NOTE(review): under Python 2 this produces an escaped byte
                # string; presumably the table keys use the same form - confirm.
                ename = spot['wikiname'].encode('unicode-escape')

                # `spot` is the very object stored in spotDict['spots'], so
                # updating it in place updates the entry that is printed below.
                if ename in categoryList:
                    spot[u'cat'] = categoryList[ename]
                else:
                    print('Cat not Found  %s' % (ename,))
                    spot[u'cat'] = []

                if ename in instanceList:
                    spot[u'type'] = instanceList[ename]
                else:
                    print('Instance not Found  %s' % (ename,))
                    spot[u'type'] = []

            print(spotDict)
def _dexterOutPath(inPath, number):
    """Return the output file name built from *inPath* and *number*."""
    # Text between the last '/' and the last '.' of the input path.
    baseName = inPath[inPath.rfind('/') + 1:inPath.rfind('.')]
    return baseName + str(number) + '_out.txt'


def getEntitiesWithDexter(argv):
    """Tag one column of a tab-separated file with Dexter and save the results.

    Args:
        argv: sequence read positionally -
            ``argv[1]`` input file path,
            ``argv[2]`` zero-based number of the first line to process
            (earlier lines are skipped),
            ``argv[3]`` index of the tab-separated column holding the query,
            ``argv[4]`` path handed to ``loadCategories``,
            ``argv[5]`` path handed to ``loadInstancesInList``.

    Each processed query is lower-cased, passed to ``tagQueryWithDexter`` and
    ``getCatAndTypeInfo``, and a non-empty result is written as one line of
    ``<input base name><start>_out.txt`` in the current directory.

    Any exception raised while tagging is reported on stdout and the line is
    not retried.  When the error text contains ``'Connection'`` the function
    waits, switches to a new output file named after the failing line number,
    tries to relaunch the Dexter jar and waits again.  Processing stops at
    the first error reported after 300 such connection failures.

    Raises:
        IndexError: when a line has no column ``argv[3]``.
    """
    inPath = argv[1]
    start = int(argv[2])
    index = int(argv[3])

    tagURL = 'http://localhost:8080/dexter-webapp/api/rest/annotate'
    print('Using ' + tagURL)

    categoryList = loadCategories(argv[4])
    instanceList = loadInstancesInList(argv[5])

    connectionErrors = 0
    outFile = open(_dexterOutPath(inPath, start), 'w')
    # try/finally because outFile is rebound inside the loop; whichever file
    # is current when we leave gets closed.
    try:
        with open(inPath, 'r') as inFile:
            for i, line in enumerate(inFile):
                # Lines before `start` are treated as already tagged.
                if i < start:
                    continue

                fields = line.strip().split('\t')
                query = fields[index].lower()

                try:
                    spotDict = tagQueryWithDexter(query, tagURL)
                    spotDict = getCatAndTypeInfo(spotDict, categoryList,
                                                 instanceList)
                    if len(spotDict) > 0:
                        outFile.write(str(spotDict) + '\n')
                except Exception as err:
                    # Best-effort loop: report the failure and keep going.
                    print('%s %s' % (i, query))
                    errStr = str(err.args)
                    print('%s %s %s' % (err, err.args, errStr))

                    if connectionErrors == 300:
                        break

                    if 'Connection' in errStr:
                        time.sleep(15)
                        # Continue in a fresh file named after this line.
                        outFile.close()
                        outFile = open(_dexterOutPath(inPath, i), 'w')
                        connectionErrors += 1
                        # Try starting the server again.
                        os.system(
                            'nohup java -Xmx4000m -jar '
                            '~/libraries/dexter2/dexter-2.1.0.jar &>'
                            ' java.log &')
                        time.sleep(15)
                    else:
                        print('%s %s' % (err, err.args))

                # Pause and report progress after every 50000th line.
                done = i + 1
                if done % 50000 == 0:
                    time.sleep(15)
                    print(done)
    finally:
        outFile.close()
def _parseBoolFlag(text):
    """Convert a command-line word such as 'true' or '0' to a bool."""
    value = text.strip().lower()
    if value in ('1', 'true', 't', 'yes', 'y'):
        return True
    if value in ('', '0', 'false', 'f', 'no', 'n'):
        return False
    raise ap.ArgumentTypeError('expected a boolean value, got %r' % (text,))


def main():
    """Build a per-query (or per-phrase) feature file from a query log.

    Command-line options: ``-i`` query log, ``-o`` output file, ``-t`` DBPedia
    type file, ``-c`` DBPedia category file, ``-u`` whether entries carry a
    user id (true/false, yes/no, 1/0), ``-w`` feature unit (``query`` or
    ``phrase``).

    Every entry of every session returned by ``getSessionTuples`` is tagged
    with ``tagQueryWithDexter``; when the result contains ``'spots'`` it is
    enriched with ``getCatAndTypeInfo``.  With ``-w query`` the whole query is
    the feature unit; with ``-w phrase`` the units are the stop-word-filtered
    pieces of the query left after splitting on each spot's mention (pieces
    of length <= 1 are dropped).  Units get sequential ids starting at 1.

    One tab-separated line is written per unit: unit text, n-grams
    (``getNGramsAsList(unit, 3)``), term vector (``getDictFromSet``), then
    the url, user, entity, category, type and session tables for that id,
    with ``{}`` for any table that has no record for the id.
    """
    parser = ap.ArgumentParser(
        description='Generate features for entity tagged queries')
    parser.add_argument('-i', '--iFile', help='Query log file', required=True)
    parser.add_argument('-o', '--oFile', help='Output feature file',
                        required=True)
    parser.add_argument('-t', '--typeFile', help='DBPedia type file',
                        required=True)
    parser.add_argument('-c', '--catFile', help='DBPedia cat file',
                        required=True)
    # type=bool would turn every non-empty string (even "False") into True.
    parser.add_argument('-u', '--uid', help='User id present or not',
                        required=True, type=_parseBoolFlag)
    parser.add_argument('-w', '--wtype',
                        help='Phrase (phrase) or query (query) features',
                        required=True)
    args = parser.parse_args()
    boolUid = args.uid

    dbCatList = loadCategories(args.catFile)
    dbTypeList = loadInstancesInList(args.typeFile)

    # queryList maps unit text -> id; every other table is filled through
    # updateDict and read back by id when the output is written.
    queryList = {}
    userList = {}
    urlList = {}
    sessionList = {}
    entityList = {}
    categoryList = {}
    typeList = {}

    tagURL = 'http://localhost:8080/dexter-webapp/api/rest/annotate'

    def recordFeatures(text, sid, entry, spots):
        """Register *text* and record the entry's features under its id."""
        if text in queryList:
            qid = queryList[text]
        else:
            # Ids are never removed, so the next free id is the count + 1.
            qid = len(queryList) + 1
            queryList[text] = qid

        updateDict(sessionList, sid, qid)
        if boolUid:
            updateDict(userList, entry[USER], qid)
        if CLICKU in entry:
            updateDict(urlList, entry[CLICKU], qid)
        for spot in spots:
            updateDict(categoryList, spot['cat'], qid)
            updateDict(typeList, spot['type'], qid)
            updateDict(entityList, encodeUTF(spot['wikiname'].lower()), qid)

    sid = 1
    # NOTE(review): the meaning of 1560 is defined by getSessionTuples -
    # presumably a session time-out; confirm.
    for session in getSessionTuples(args.iFile, '\t', 1560):
        print('Session id and length %s %s' % (sid, len(session)))
        for entry in session:
            query = entry[QUERY]
            spotDict = tagQueryWithDexter(query, tagURL)

            # Reset per entry so a query without spots never inherits the
            # entities found for the previous query.
            updatedSpotDict = None
            if 'spots' in spotDict:
                updatedSpotDict = getCatAndTypeInfo(spotDict, dbCatList,
                                                    dbTypeList)
            spots = updatedSpotDict['spots'] if updatedSpotDict else []

            if args.wtype == 'query':
                recordFeatures(query, sid, entry, spots)
            elif args.wtype == 'phrase':
                for spot in spots:
                    for piece in query.split(spot['mention']):
                        piece = filterStopWordsFromQuery(piece.strip())
                        if len(piece) > 1:
                            recordFeatures(piece, sid, entry, [spot])
        sid += 1

    # Write the features; the file is closed even if a helper raises.
    tables = (urlList, userList, entityList, categoryList, typeList,
              sessionList)
    with open(args.oFile, 'w') as outF:
        for query, qid in queryList.items():
            queryVect = getDictFromSet(query.split())
            ngramString = getNGramsAsList(query, 3)

            outF.write(query)
            outF.write('\t' + str(ngramString))
            outF.write('\t' + str(queryVect))
            for table in tables:
                if qid in table:
                    outF.write('\t' + str(table[qid]))
                else:
                    outF.write('\t{}')
            outF.write('\n')