Beispiel #1
0
def findEntityCategory(entFile, categoryFile, instanceFile):
  categoryList = loadCategories(categoryFile)
  instanceList = loadInstancesInList(instanceFile)
  for line in open(entFile, 'r'):
    #entry= str(line.strip().lower())
    #entry =  entry[2:-1].strip()
    #entry = unicode(entry1)

    spotDict = ast.literal_eval(line.lower())
    spots = spotDict['spots']
    i = 0
    for spot in spots:
      ename = (spot['wikiname']).encode('unicode-escape')
      if ename in categoryList:
        spotDict['spots'][i][u'cat'] = categoryList[ename]
      else:
        print 'Cat not Found ', ename
        spotDict['spots'][i][u'cat'] = []

      if ename in instanceList:
        spotDict['spots'][i][u'type'] = instanceList[ename]
      else:
        print 'Instance not Found ', ename
        spotDict['spots'][i][u'type'] = []
      i += 1
    print spotDict
def getEntitiesWithDexter(argv):
  #find the entities with dexter
  index = int(argv[3])
  #if hosted on multiple ips
  ipList = ['localhost']
  ipaddress = ipList[0]  #if random.random() > 0.5 else ipList[0]
  tagURL = 'http://' + ipaddress + ':8080/dexter-webapp/api/rest/annotate'
  #catURL = 'http://'+ipaddress+':8080/dexter-webapp/api/rest/get-desc'
  print 'Using ' + tagURL

  start = int(argv[2])
  outFile = open(argv[1][argv[1].rfind('/') + 1:argv[1].rfind('.')] + str(start)
                 + '_out.txt', 'w')

  i = 0
  e = 0

  categoryList = loadCategories(argv[4])
  instanceList = loadInstancesInList(argv[5])

  for line in open(argv[1], 'r'):
    #if already tagged
    if i < start:
      pass
    split = line.strip().split('\t')
    query = split[index].lower()  #get the result with url
    try:
      spotDict = tagQueryWithDexter(query, tagURL)
      spotDict = getCatAndTypeInfo(spotDict, categoryList, instanceList)
      if len(spotDict) > 0:
        outFile.write(str(spotDict) + '\n')

    except Exception as err:
      print i, query
      errStr = ''.join(str(err.args))
      print err, err.args, errStr
      if e == 300:
        break
      if 'Connection' in errStr:
        time.sleep(15)
        outFile.close()
        outFile = open(argv[1][argv[1].rfind('/') + 1:argv[1].rfind('.')] +
                       str(i) + '_out.txt', 'w')
        e += 1
        #try starting the server again
        os.system(
            'nohup java -Xmx4000m -jar ~/libraries/dexter2/dexter-2.1.0.jar &>'
            ' java.log &')
        time.sleep(15)
      else:
        print err, err.args
    i += 1
    if i % 50000 == 0:
      if i > start:
        time.sleep(15)
        print i
    '''if len(domCat) > 0:

                        avgCatFrq = sum(domCat.values()) / len(domCat)
                        avgEnityFrq = sum(domEnt.values()) / len(domEnt)
                        toRemoveCat = [k for k,v in domCat.iteritems() if v <
                        avgCatFrq]

                        print session, domEnt, domCat, avgCatFrq, avgEnityFrq
                        print 'To Remove Cat List', toRemoveCat
                        for query, spotList in querySpotList.iteritems():
                                matchl = spotList.keys()
                                for entry in matchl:
                                        if domEnt[entry] < avgEnityFrq:
                                                spotList.pop(entry,None)
                                                print 'Removing spot', entry
                                        else:
                                                for cat in toRemoveCat:
                                                        spotList[entry]['cat']=
                                                        spotList[entry]['cat'].replace(cat,'').strip()
                                outFile.write( query +'\t'+ str(spotList)+'\n')
                '''
  outFile.close()
def main():
    parser = ap.ArgumentParser(description = 'Generate features for entity tagged queries')
    parser.add_argument('-i', '--iFile', help='Query log file', required=True)
    parser.add_argument('-o', '--oFile', help='Output feature file', required=True)
    parser.add_argument('-t', '--typeFile', help='DBPedia type file', required=True)
    parser.add_argument('-c', '--catFile', help='DBPedia cat file', required=True)
    parser.add_argument('-u', '--uid', help='User id present or not', required=True,type=bool)
    parser.add_argument('-w', '--wtype', help='Phrase (phrase) or query (query) features', required=True)

    args = parser.parse_args()

    boolUid = args.uid

    #load the category list
    dbCatList = loadCategories(args.catFile)
    #print 'Categories',len(dbCatList)
    #load the type list
    dbTypeList = loadInstancesInList(args.typeFile)
    #print 'Types',len(dbTypeList)

    #query list
    queryList = {}
    #user list
    userList = {}
    #url list
    urlList = {}
    #session list
    sessionList = {}
    #entity List
    entityList = {}
    #category List
    categoryList = {}
    #type list
    typeList = {}

    ipaddress = 'localhost'
    tagURL = 'http://'+ipaddress+':8080/dexter-webapp/api/rest/annotate'

    cqid = 1
    sid = 1
    qid = None
    for session in getSessionTuples(args.iFile,'\t', 1560):
        print 'Session id and length' , sid, len(session)
        for entry in session:
            query = entry[QUERY]
            #tag it with dexter and get all 3 parameters
            spotDict = tagQueryWithDexter(query,tagURL)
            if 'spots' in spotDict:
                updatedSpotDict = getCatAndTypeInfo(spotDict,dbCatList, dbTypeList)
            if args.wtype == 'query':
                #given wtype find the following
                if query not in queryList:
                    #print 'Mapping ', query , 'to ', cqid
                    queryList[query] = cqid
                    qid = cqid
                    cqid+=1
                else:
                    qid = queryList[query]
                updateDict(sessionList,sid, qid)

                if boolUid:
                    updateDict(userList, entry[USER], qid)
                if CLICKU in entry:
                    updateDict(urlList, entry[CLICKU],qid)
                if updatedSpotDict:
                    for spot in updatedSpotDict['spots']:
                        updateDict(categoryList,spot['cat'], qid)
                        updateDict(typeList,spot['type'], qid)
                        updateDict(entityList,encodeUTF(spot['wikiname'].lower()),qid)

            if args.wtype == 'phrase':
                for spot in updatedSpotDict['spots']:
                    splits = query.split(spot['mention'])
                    for split in splits:
                        split = split.strip()
                        #remove stop words
                        split = filterStopWordsFromQuery(split)
                        if len(split) > 1:
                            if split not in queryList:
                                queryList[split] = cqid
                                qid = cqid
                                cqid+=1
                            else:
                                qid = queryList[split]
                            updateDict(sessionList,sid, qid)

                            if boolUid:
                                updateDict(userList, entry[USER], qid)
                            if CLICKU in entry:
                                updateDict(urlList, entry[CLICKU],qid)
                            if updatedSpotDict:
                                updateDict(categoryList,spot['cat'], qid)
                                updateDict(typeList,spot['type'], qid)
                                updateDict(entityList,encodeUTF(spot['wikiname'].lower()),qid)
        sid+=1

    #write the features to the outfile
    outF = open(args.oFile,'w')

    for query, qid in queryList.items():
        outF.write(query)
        #generate ngrams
        queryVect = getDictFromSet(query.split())
        ngramString = getNGramsAsList(query,3)
        #ngrams = 1
        outF.write('\t'+str(ngramString))
        #query vect = 2
        outF.write('\t'+str(queryVect))


        if qid in urlList:
            outF.write('\t'+str(urlList[qid]))
        else:
            outF.write('\t{}')

        if qid in userList:
            outF.write('\t'+str(userList[qid]))
        else:
            outF.write('\t{}')

        if qid in entityList:
            outF.write('\t'+str(entityList[qid]))
        else:
            outF.write('\t{}')

        if qid in categoryList:
            outF.write('\t'+str(categoryList[qid]))
        else:
            outF.write('\t{}')

        if qid in typeList:
            outF.write('\t'+str(typeList[qid]))
        else:
            outF.write('\t{}')

        if qid in sessionList:
            outF.write('\t'+str(sessionList[qid]))
        else:
            outF.write('\t{}')

        outF.write('\n')

    outF.close()