Example #1
def prepareTrainingDataset(sameTaskFile, dataSubsetFile, taskQueryFile):
  #select only those pairs which have both queries in queryFile
  tQueryList = {}
  #queryKey = {}
  keyQuery = {}
  for line in open(taskQueryFile, 'r'):
    tQueryList[line.strip()] = 1.0

  print len(tQueryList)

  for line in open(sameTaskFile, 'r'):
    split = line.split('\t')
    query = split[4].strip()
    key = '_'.join(split[:3])
    if query in tQueryList:
      if key not in keyQuery:
        keyQuery[key] = {}
      keyQuery[key][query] = 1.0

  keys = keyQuery.keys()
  newDict = {}
  skip = {}
  for i in range(len(keys)):
    if i not in skip:
      newDict[i] = keyQuery[keys[i]]
      for j in range(i + 1, len(keys)):
        if j not in skip:
          cos = get_cosine(keyQuery[keys[i]], keyQuery[keys[j]])
          if cos > 0.70:
            newDict[i].update(keyQuery[keys[j]])
            skip[j] = True

  for entry, queries in newDict.items():
    if len(queries) > 1:
      print '\t'.join(queries.keys())
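
All of the examples in this listing call a get_cosine helper on sparse term-weight dictionaries, but the helper itself is never shown. A minimal sketch, assuming the usual dict-of-weights representation (the body here is illustrative, not the original implementation):

import math

def get_cosine(vec1, vec2):
    # cosine similarity between two sparse vectors stored as dicts
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[x] * vec2[x] for x in intersection)
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * \
                  math.sqrt(sum(v * v for v in vec2.values()))
    if not denominator:
        return 0.0
    return float(numerator) / denominator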
Example #2
  def scoreWithCosine(self, qSet, clustList, cIndex, limit):
    toEvaluate = []
    done = set()
    for entry in qSet:
      try:
        clusters = cIndex[entry]
        #print 'CLUSTERS',entry, clusters
        for cind in clusters:
          if cind not in done:
            toEvaluate.append(clustList[cind])
            done.add(cind)
      except (KeyError, IndexError):  # entry not in the index or stale cluster id
        pass

    #for each cluster find cosine similarity
    clustScore = {}
    i = 0
    qDict = getDictFromSet(qSet)
    for clust in toEvaluate:
      cos = get_cosine(qDict, clust)
      if cos > 0:
        clustScore[i] = cos
      i += 1

    toReturn = []
    for entry in sorted(clustScore.items(), reverse=True, key=lambda x: x[1]):
      toReturn.append(toEvaluate[entry[0]].keys())

    return toReturn
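
scoreWithCosine (and several later examples) relies on a getDictFromSet helper that is not part of the listing. Judging by its use, it probably just turns a set of terms into a uniformly weighted dict; a possible sketch, assuming unit weights:

def getDictFromSet(termSet):
    # map every term to weight 1.0 so a plain set can be passed to get_cosine
    return {term: 1.0 for term in termSet}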
Example #3
  def scoreCategories(self, querySet, queryDict, spotDict, k):
    entityCatScore = {}
    for entry, eDict in spotDict.iteritems():
      catList = eDict['cat'].lower().split()
      queryTerms = querySet - set([entry])
      catScore = {}
      for cat in catList:
        pset = self.catManager.getPhraseSet(cat)  #unique phrases in cat
        qInt = pset & queryTerms  #no of query terms cat contains
        score = 0.0
        for iphrase in qInt:
          score += self.catManager.getPhraseProb(cat, iphrase)
        if len(queryTerms) > 0:
          score *= (1.0 * len(qInt)) / len(queryTerms)

        #cosine score
        cVector = self.catManager.getVector(cat)
        cscore = get_cosine(queryDict, cVector)

        #total score
        catScore[cat] = (cscore + score) / 2.0
      sortedScore = sorted(catScore.items(), reverse=True, key=lambda x: x[1])

      #get terms from all categories
      if k == 1000 or k > len(sortedScore):
        k = len(sortedScore)

      entityCatScore[entry] = sortedScore[0:k]

      print 'Query\t', querySet, ' Entity\t', entry, entityCatScore[entry]
    return entityCatScore
Example #4
 def findCosineDistance(self, qFeat):
   qCos = get_cosine(self.queryVector, qFeat.queryVector)
   uCos = get_cosine(self.urlDict, qFeat.urlDict)
   userCos = get_cosine(self.userDict, qFeat.userDict)
   sessionCos = get_cosine(self.sessionDict, qFeat.sessionDict)
   ngramsCos = get_cosine(self.ngrams, qFeat.ngrams)
   entCos = get_cosine(self.entDict, qFeat.entDict)
   catCos = get_cosine(self.catDict, qFeat.catDict)
   typeCos = get_cosine(self.typeDict, qFeat.typeDict)
   return (qCos, uCos, userCos, sessionCos, ngramsCos, entCos, catCos, typeCos)
Example #5
def getFeatureSimilarity(feat1, feat2):
  simDict = {}
  for ftype, fdict1 in feat1.iteritems():
    if ftype in feat2:
      fdict2 = feat2[ftype]
      cos = get_cosine(fdict1, fdict2)
      simDict[ftype + '_cos'] = cos

  return str(simDict)
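
A quick usage sketch for getFeatureSimilarity, with hypothetical feature dictionaries keyed by feature type:

feat1 = {'url': {'wiki.org': 2.0, 'imdb.com': 1.0}, 'cat': {'film': 3.0}}
feat2 = {'url': {'wiki.org': 1.0}, 'ent': {'inception': 1.0}}
# only feature types present in both inputs are compared, so the result
# string contains a 'url_cos' entry but no 'cat_cos' or 'ent_cos'
print(getFeatureSimilarity(feat1, feat2))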
Example #6
 def getSimilarity(self):
   keys = self.vector.keys()
   for i in range(len(keys)):
     v1 = self.vector[keys[i]]
     for j in range(i, len(keys)):
       v2 = self.vector[keys[j]]
       sim = get_cosine(v1, v2)
       if sim > 0:
         print keys[i], keys[j], sim
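
getSimilarity assumes self.vector maps a label (e.g. a query or task id) to a sparse term dict; a hypothetical instance of that structure:

# hypothetical shape of self.vector: label -> sparse term-weight dict
vector = {
    'task_1': {'hotel': 2.0, 'paris': 1.0},
    'task_2': {'paris': 3.0, 'flight': 1.0},
    'task_3': {'recipe': 1.0},
}
# the nested loop prints every pair with a non-zero cosine; note that
# range(i, ...) also compares each label with itself (similarity 1.0)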
Example #7
def preprocess(fileName):
    tknzr = TweetTokenizer()
    #nltk.download('stopwords')
    stop = stopwords.words('english')
    #stop += [ '<url>',  '<user>', '<repeat>', '<elong>']
    #the tweet processor 
    tweet_processor = TweetPreprocessor()
    #load the tweets 
    """tweets_file = '/Users/aedouard/Documents/_dev/these/data/websummit_dump_20151106155110'
    with open(tweets_file) as f:
        tweets = json.load(f)
    """
    tweets = loadTweets(fileName)
    #tweet_texts_processed = []
    #tweet_texts = []
    new_data = {}
    found = 0
    #stemming 
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    #print(tweets.keys())
    #print('\n'.join([t['text'] for t in tweets['1639']]))
    #sys.exit(1)
    for key in tweets.keys():
        d = tweets[key]
        new_data[key] = []
        print (key, len(d))
        for i in range(len(d)):
            add = True
            tweet1 = d[i]
            # skip tweets that are near-duplicates (cosine > 0.9) of an earlier tweet
            for j in range(0, i):
                tweet2 = d[j]
                cosine = utils.get_cosine(tweet1['text'], tweet2['text'])
                if cosine > 0.9:
                    add = False
                    found = found + 1
                    break
            if add:
                # strip punctuation, tokenize, drop stopwords and stem
                no_punctuation = tweet1["text"].translate(None, string.punctuation)
                tokens = nltk.word_tokenize(no_punctuation)
                #count = Counter(tokens)
                filtered = [w for w in tokens if w not in stop]
                #parts = tknzr.tokenize(tweet_processor.preprocess(tweet1["text"]))
                #clean = [i for i in parts if i not in stop]
                texts = [p_stemmer.stem(i) for i in filtered]
                tweet1["processed"] = texts
                new_data[key].append(tweet1)
               

    return new_data    
Example #8
def main(argv):
  taskList = loadTasks(argv[1])
  userVectFile = open('userVect.txt', 'w')
  userTaskSimFile = open('userTaskSim.txt', 'w')
  uSim = {}
  for uId, termVector in getUserVector(argv[2], 0, 1):
    uSim[uId] = {}
    userVectFile.write(
        str(uId) + '\t' + '\t'.join('{0}\t{1}'.format(x, y)
                                    for x, y in termVector.items()) + '\n')

    for taskid, task in taskList.iteritems():
      sim = round(get_cosine(task, termVector), 5)
      uSim[uId][taskid] = sim
    tSort = sorted(uSim[uId].items(), reverse=True, key=lambda x: x[0])
    userTaskSimFile.write(str(uId) + '\t' + '\t'.join(str(x[1])
                                                      for x in tSort) + '\n')
Example #9
def calculateSimilarityMatrix(taskVectorDict, tokenDict):
  print 'In similarity'
  nTasks = len(taskVectorDict)
  print nTasks
  sim = numpy.zeros(shape=(nTasks, nTasks))
  for entry, vector in taskVectorDict.iteritems():
    print entry, vector
    taskIndexList = {}
    for token in vector.keys():
      for ntid in tokenDict[token].keys():
        if ntid > entry:
          taskIndexList[ntid] = 1
    print entry, len(taskIndexList)
    for i in taskIndexList.keys():
      print entry, i, sim
      sim[entry][i] = sim[i][entry] = get_cosine(vector, taskVectorDict[i])
      print 'sim', entry, i, vector, taskVectorDict[i], sim[entry][i]
  return sim
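
calculateSimilarityMatrix only compares tasks that share at least one token, so it expects an inverted index alongside the task vectors. A hypothetical call, assuming integer task ids:

# hypothetical inputs: task id -> term weights, and token -> {task id: 1}
taskVectorDict = {0: {'hotel': 2.0, 'paris': 1.0},
                  1: {'paris': 3.0},
                  2: {'recipe': 1.0}}
tokenDict = {'hotel': {0: 1}, 'paris': {0: 1, 1: 1}, 'recipe': {2: 1}}
sim = calculateSimilarityMatrix(taskVectorDict, tokenDict)
# only tasks 0 and 1 share a token, so sim[0][1] == sim[1][0] > 0
# and every other off-diagonal entry stays 0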
Example #10
def getPairFeatures(session):

  totalTime = 1.0 + (session[-1][QTIME] - session[0][QTIME]).total_seconds()
  for i in range(len(session) - 1):
    for j in range(i + 1, len(session)):
      e1 = session[i]
      e2 = session[j]
      jaccard = 1.0 - distance.jaccard(e1[QUERY].split(), e2[QUERY].split())
      edit = 1.0 - distance.nlevenshtein(e1[QUERY].split(), e2[QUERY].split())
      timeDiff = ((e2[QTIME] - e1[QTIME]).total_seconds()) / totalTime * 1.0
      #normalized distance
      dist = (j - i) * 1.0 / len(session)
      urlMatch = -1
      if CLICKU in e1 and CLICKU in e2:
        urlMatch = 1.0 - distance.nlevenshtein(e1[CLICKU], e2[CLICKU])
      cosine = get_cosine(text_to_vector(e1[QUERY]), text_to_vector(e2[QUERY]))
      edgeScore = .20 * cosine + .20 * jaccard + .20 * edit + .15 * dist + .15 * timeDiff + .10 * urlMatch
      yield i, j, edgeScore, cosine, jaccard, edit, dist, timeDiff, urlMatch
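
getPairFeatures turns each query into a term-count vector with text_to_vector before computing the cosine. That helper is not shown either; a minimal sketch, assuming simple word-level term frequencies:

import re
from collections import Counter

WORD = re.compile(r'\w+')

def text_to_vector(text):
    # term-frequency Counter (a dict subclass) suitable for get_cosine
    words = WORD.findall(text.lower())
    return Counter(words)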
Example #11
  def aggregateTerms(self, query, entityCatScore):
    #max -- Take the terms from max category
    #weight = {}
    tList = {}
    for entity, catScoreList in entityCatScore.iteritems():
      for catS in catScoreList:
        pList = self.catManager.getPhrases(catS[0])
        for x in pList:
          if x[0] not in stopSet and x[0] not in query:
            tList[x[0]] = tList.setdefault(x[0], 0.0) + x[1]

    #termList = list(tList.keys())
    print 'Term size', len(tList)

    sTerm = sorted(tList.items(), reverse=True, key=lambda x: x[1])
    sw = SimpleWalk()
    k = len(sTerm) if len(sTerm) < 1000 else 1000
    for i in range(0, k):
      ivect = self.vectManager.getVector(sTerm[i][0])
      if ivect:
        #weight[termList[i]] = {}
        for j in range(i + 1, k):
          jvect = self.vectManager.getVector(sTerm[j][0])
          if jvect:
            #print sTerm[i][0], ivect
            #print sTerm[j][0], jvect
            sim = get_cosine(ivect, jvect)
            if sim > 0.001:
              #weight[termList[i]][termList[j]] = sim
              sw.addEdge(sTerm[i][0], sTerm[j][0], sim)

    print 'Done graph, starting walk'
    #return tList
    try:
      results = sw.walk()
      return results
    except Exception:
      return {}
Example #12
 def getVectSim(self, term1, term2, vectManager):
   ivect = vectManager.getVector(term1)
   jvect = vectManager.getVector(term2)
   sim = get_cosine(ivect, jvect)
   return sim
Example #13
def use_system2(sow,labmbda,n_results):


    # ===================== read the model =================

    master_phrases_vectors = use.loadmodel('master_phrases_vectors_2')
    tfidftransformer_1 = use.loadmodel('tfidftransformer_1_2')
    tfidftransformer_2 = use.loadmodel('tfidftransformer_2_2')
    master_nerank_vectors=use.loadmodel('master_nerank_vectors_2')

    vocab_map_2 = {v: k for k, v in tfidftransformer_2.vocabulary_.iteritems()}
    vocab_map_1 = {v: k for k, v in tfidftransformer_1.vocabulary_.iteritems()}

    # ===================== read sow =======================

    sow = sow#.decode('utf-8')


    # ===================== vectorize the SOW =======================

    sow_tf1=tfidftransformer_1.transform([sow])[0]
    sow_tf2=tfidftransformer_2.transform([sow])[0]

    if len(sow_tf1.indices) == 0 or len(sow_tf2.indices) == 0:
        print 'sow is not large enough for this system. Please, try System 1'
        return [],[]

    sow_final_vec=ne_rank(sow_tf1,sow_tf2,tfidftransformer_1,vocab_map_1,vocab_map_2)

    phrases=use.noun_tokenize(sow)
    phrases=list(set(phrases))
    phrases_vectors=[list(tfidftransformer_1.transform([x])[0].indices) for x in phrases]
    sow_phrase_dict = {}
    for x, phrase in zip(phrases_vectors, phrases):
        x = [sow_final_vec[y] for y in x if y in sow_final_vec.keys()]
        avg = np.sum(x)
        sow_phrase_dict[phrase] = avg


    # ===================== find cosine similarities =======================

    similarities=[]
    all_important_terms_tf=[]
    all_important_terms_keywords=[]
    for nerank_vec,phrase_dict in zip(master_nerank_vectors,master_phrases_vectors):
        sim_nerank,product_tf=use.get_cosine(nerank_vec,sow_final_vec)
        keys = product_tf.keys()
        values = product_tf.values()
        important_terms_tf = list(reversed(np.argsort(values)))
        important_terms_tf = [vocab_map_1[keys[x]] for x in important_terms_tf]
        all_important_terms_tf.append(important_terms_tf)

        sim_keyword,product_keyword=use.get_cosine(phrase_dict,sow_phrase_dict)
        keys = product_keyword.keys()
        values = product_keyword.values()
        important_terms_keyword = list(reversed(np.argsort(values)))
        important_terms_keyword = [keys[x] for x in important_terms_keyword]
        all_important_terms_keywords.append(important_terms_keyword)

        sim=(labmbda*sim_nerank)+((1-labmbda)*sim_keyword)
        similarities.append(sim)


    # ===================== rank the documents and print the top n =======================

    ranked_docs=list(reversed(np.argsort(similarities)))

    results_sim = []
    results_index = []
    for i in range(n_results):
        index=ranked_docs[i]
        # print similarities[index]
        results_sim.append(format(100*similarities[index],'.2f'))
        results_index.append(index)
        # print all_important_terms_tf[index]
        # print all_important_terms_keywords[index]

    return results_index, results_sim
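
Note that use.get_cosine in this example returns a pair: the similarity plus a dict of per-key products, which is then used to rank the most important shared terms. A hypothetical sketch of such a variant (the real implementation in the use module may differ):

import math

def get_cosine_with_products(vec1, vec2):
    # like get_cosine, but also return the per-key products that make up
    # the numerator so callers can rank the overlapping terms
    intersection = set(vec1.keys()) & set(vec2.keys())
    products = {x: vec1[x] * vec2[x] for x in intersection}
    numerator = sum(products.values())
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * \
                  math.sqrt(sum(v * v for v in vec2.values()))
    sim = float(numerator) / denominator if denominator else 0.0
    return sim, products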
Example #14
 def getSimilarity(self, tDict, entry):
   return get_cosine(tDict, entry)
Example #15
def syn_cosine(text1, text2):
    vec1 = to_syn_vec(text1)
    vec2 = to_syn_vec(text2)
    return get_cosine(vec1, vec2)
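
to_syn_vec is not included in the listing; the name suggests the text is expanded into a synonym-aware vector before the cosine is taken. One purely illustrative sketch using WordNet (the original helper may work quite differently):

from collections import Counter
from nltk.corpus import wordnet as wn

def to_syn_vec(text):
    # illustrative only: count each token plus the lemma names of its
    # WordNet synsets so near-synonymous texts share vector dimensions
    vec = Counter()
    for word in text.lower().split():
        vec[word] += 1
        for synset in wn.synsets(word):
            for lemma in synset.lemma_names():
                vec[lemma] += 1
    return vec

Under this sketch, syn_cosine('car', 'automobile') would be positive even though the surface strings differ.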
Example #17
def getStatsPerQuery(argv):
  tagURL = 'http://localhost:8080/rest/annotate'
  catURL = 'http://localhost:8080/rest/graph/get-entity-categories'

  catVector = loadCategoryVector(argv[3])
  f1Dict = getCats(argv[2])
  sFound = 0.0
  sTotal = 0.0
  eTotal = set()
  eRemov = set()
  catFoundNoTerm = set()
  catNotFound = set()
  catTermFound = set()
  catEntity = set()
  outfile = open('match_session_dom.txt', 'w')
  #categoryVectors = {}
  for session in getSessionWithNL(argv[1]):
    catCount = {}
    entCount = {}
    querySpotList = {}
    for query in session:
      #find the entities in query
      try:
        spotDict = tagQueryWithDexter(query, tagURL, catURL)
        querySpotList[query] = spotDict
        for text in spotDict.keys():
          for entry in spotDict[text]['cat'].split():
            catCount[entry] = catCount.setdefault(entry, 1) + 1
          entCount[text] = entCount.setdefault(text, 1) + 1
      except Exception as err:
        print err
        #print 'SESSION', session, 'CATCOUNT', catCount, 'ENTCOUNT',entCount

    found = False
    if len(catCount) > 0:
      #find the dominant entity
      maxEnt = max(entCount.values())
      #sessionQueryMapping = {}
      for query, spotList in querySpotList.iteritems():
        matchl = spotList.keys()
        for entry in matchl:
          eTotal.add(entry)
          if entCount[entry] < maxEnt:
            spotList.pop(entry, None)
            print 'Removing spot', query, entry
            eRemov.add(entry)
          else:
            #get the categories
            #catTermMatch = {}
            rquery = query.replace(entry, '')
            queryTerms = set(rquery.split())
            for cat in spotList[entry]['cat'].lower().split():
              catEntity.add(entry + '_' + cat)
              if cat in f1Dict:
                phrase1 = loadPhrasesWithScore(argv[2] + '/' + f1Dict[cat])
                pVector = catVector[cat]
                queryDict = getDictFromSet(queryTerms)
                pTotal = sum(phrase1.values())
                pset = set(phrase1.keys())
                sint = pset & queryTerms
                score = 0.0
                cscore = get_cosine(queryDict, pVector)

                for iphrase in sint:
                  score += phrase1[iphrase] / pTotal
                if len(queryTerms) > 0:
                  score *= (1.0 * len(sint)) / len(queryTerms)

                if sint:

                  outfile.write(query + '\t' + entry + '\t' + cat + '\t' +
                                str(cscore) + '\t' + ', '.join(sint) + '\n')
                  found = True
                  catTermFound.add(entry + '_' + cat)
                else:
                  outfile.write(query + '\t' + entry + '\t' + cat + '\t0\t0\n')
                  catFoundNoTerm.add(cat + '_' + entry)
              else:
                outfile.write(
                    query + '\t' + entry + '\t' + cat + '\t0\tNOT FOUND\n')
                catNotFound.add(cat + '_' + entry)

                #load the terms for category
                #check if these terms match
    if found:
      sFound += 1
    sTotal += 1
    outfile.write('\n')

  print 'Total Sessions ', sTotal
  print 'Sessions with dominant entity in AOL', sFound
  print '# Unique Entities', len(eTotal)
  print '# Removed Entities (non dominant)', len(eRemov)
  print '# no of entity types', len(catEntity)
  print '# no of entity types with terms match ', len(catTermFound)
  print '# no of entity types with no term match', len(catFoundNoTerm)
  print '# no of entity types with no match in AOL', len(catNotFound)
Example #18
def getPrecRecall(opt, catList, f1Dict, catVector, queryTerms, aTerms, index):

  catScore = {}
  maxQs = -1000
  maxCat = ''

  notFound = set()
  for cat in catList:
    if cat in f1Dict:
      catScore[cat] = {'aP': 0.0,
                       'aR': 0.0,
                       'qS': 0.0,
                       'qInt': set(),
                       'aInt': set()}
      #phrase cat score
      phrase1 = loadPhrasesWithScore(f1Dict[cat])
      pTotal = sum(phrase1.values())  #total no of terms in cat
      pset = set(phrase1.keys())  #unique phrases in cat
      qInt = pset & queryTerms  #no of query terms cat contains
      score = 0.0
      for iphrase in qInt:
        score += phrase1[iphrase] / pTotal
      if len(queryTerms) > 0:
        score *= (1.0 * len(qInt)) / len(queryTerms)

      #cosine score
      queryDict = getDictFromSet(queryTerms)
      cVector = catVector[cat]
      cscore = get_cosine(queryDict, cVector)

      #total score
      catScore[cat]['qS'] = cscore + score
      if maxQs < catScore[cat]['qS']:
        maxQs = catScore[cat]['qS']
        maxCat = cat

      sortP = sorted(phrase1.items(), reverse=True, key=lambda x: x[1])
      #print 'sorted' , sortP[0],sortP[1]
      apset = set(x[0] for x in sortP[0:index])
      #print 'pSet ',apset

      aInt = aTerms & apset
      catScore[cat]['aP'] = (1.0 * len(aInt)) / len(aTerms)
      catScore[cat]['aR'] = (1.0 * len(aInt)) / len(apset)
      catScore[cat]['aInt'] = aInt
      catScore[cat]['qInt'] = qInt
    else:
      notFound.add(cat)

  if opt == 'max':
    if maxCat in catScore:
      return notFound, maxCat, catScore[maxCat]
    else:
      return notFound, None, {
          'aP': 0.0,
          'aR': 0.0,
          'qS': 0.0,
          'qInt': set(),
          'aInt': set()
      }

  else:
    avgScore = {'aP': 0.0, 'aR': 0.0, 'qS': 0.0, 'qInt': set(), 'aInt': set()}
    for entry, cdict in catScore.iteritems():
      avgScore['aP'] += cdict['aP']
      avgScore['aR'] += cdict['aR']
      avgScore['qS'] += cdict['qS']
      avgScore['qInt'] |= cdict['qInt']
      avgScore['aInt'] |= cdict['aInt']

    avgScore['aP'] /= len(catScore)
    avgScore['aR'] /= len(catScore)
    avgScore['qS'] /= len(catScore)

    return notFound, None, avgScore

Example #19
def use_system1(sow,labmbda,n_results):


    # ===================== read the model =================

    master_phrases_vectors=use.loadmodel('master_phrases_vectors_1')
    texts_all_tf=use.loadmodel('texts_all_tf_1')
    tfidftransformer=use.loadmodel('tfidftransformer_1')

    vocab_map = {v: k for k, v in tfidftransformer.vocabulary_.iteritems()}

    # ===================== read sow =======================

    sow=sow#.decode('utf-8')

    # ===================== vectorize the SOW =======================

    sow_tf=tfidftransformer.transform([sow])[0]
    sow_tf=sow_tf.todense()
    phrases=use.noun_tokenize(sow)
    phrases=list(set(phrases))
    phrases_vectors=[list(tfidftransformer.transform([x])[0].indices) for x in phrases]
    sow_phrase_dict = {}
    for x, phrase in zip(phrases_vectors, phrases):
        x = np.array(sow_tf).flatten()[x]
        avg = np.mean(x)
        sow_phrase_dict[phrase] = avg

    # ===================== find cosine similarities =======================

    similarities=[]
    all_important_terms_tf=[]
    all_important_terms_keywords=[]
    for text_tf,phrase_dict in zip(texts_all_tf,master_phrases_vectors):
        sim_tf=cosine_similarity(text_tf,sow_tf)
        product=np.array(text_tf.todense()).flatten()*np.array(sow_tf).flatten()
        important_terms_tf=list(reversed(np.argsort(product)))[:10]
        important_terms_tf=[vocab_map[x] for x in important_terms_tf]
        all_important_terms_tf.append(important_terms_tf)
        sim_tf=sim_tf.flatten()[0]
        sim_keyword,product_keyword=use.get_cosine(phrase_dict,sow_phrase_dict)
        keys=product_keyword.keys()
        values=product_keyword.values()
        important_terms_keyword = list(reversed(np.argsort(values)))
        important_terms_keyword=[keys[x] for x in important_terms_keyword]
        all_important_terms_keywords.append(important_terms_keyword)
        sim=(labmbda*sim_tf)+((1-labmbda)*sim_keyword)
        similarities.append(sim)


    # ===================== rank the documents and print the top n =======================

    ranked_docs=list(reversed(np.argsort(similarities)))
    results_sim=[]
    results_index=[]
    for i in range(n_results):
        index=ranked_docs[i]
        # print similarities[index]
        results_sim.append(format(100*similarities[index],'.2f'))
        results_index.append(index)
        # print all_important_terms_tf[index]
        # print all_important_terms_keywords[index]

    return results_index,results_sim
Example #20
 def getCosine(self, word2):
   eCos = round(get_cosine(self.ent, word2.ent), 3)
   cCos = round(get_cosine(self.cat, word2.cat), 3)
   uCos = round(get_cosine(self.url, word2.url), 3)
   return eCos, cCos, uCos
Example #21
 def getUrlCosine(self, word2):
   return round(get_cosine(self.url, word2.url), 3)
Example #22
 def getCatCosine(self, word2):
   return round(get_cosine(self.cat, word2.cat), 3)
Example #23
 def getEntCosine(self, word2):
   return round(get_cosine(self.ent, word2.ent), 3)