Esempio n. 1
0
def loadPairs(queryId, fileName, labeledPoints, pairLabels):
  """Collect the ids and id pairs of the known queries listed in a file.

  Each line of ``fileName`` is treated as one group of tab-separated
  entries. Every entry is stripped, passed through ``filterWords`` and
  looked up in ``queryId``; entries that are not keys of ``queryId`` are
  ignored.

  Args:
    queryId: mapping from filtered query text to the query's id.
    fileName: path of the tab-separated file to read.
    labeledPoints: collection with an ``add`` method; receives every id
      found in the file (modified in place).
    pairLabels: collection with an ``add`` method; receives every item
      produced by ``generatePairsFromList`` for the sorted ids of each
      line (modified in place).

  Returns:
    None. Results are delivered through ``labeledPoints`` and
    ``pairLabels``.
  """
  # 'with' closes the handle even when one of the helpers raises.
  with open(fileName, 'r') as pairFile:
    for line in pairFile:
      cpoints = set()
      for entry in line.split('\t'):
        normQuery = filterWords(entry.strip())
        if normQuery in queryId:
          qid = queryId[normQuery]
          labeledPoints.add(qid)
          cpoints.add(qid)

      # Pairs are built once per line, after all ids of the line are known.
      # Only the result for the complete id set was ever used, so the
      # per-entry recomputation was wasted work.
      # NOTE(review): assumes generatePairsFromList has no side effects.
      for pair in generatePairsFromList(sorted(cpoints)):
        pairLabels.add(pair)
Esempio n. 2
0
def calculateIndiciesFromFiles(trueLabelFile, differentPairFile, predictedLabelFile,
                      queryList):
  """Count pair-level agreement between labelled and predicted groupings.

  Args:
    trueLabelFile: path of a file whose lines list tab-separated queries
      labelled as belonging together; loaded with ``loadPairs``.
    differentPairFile: path of a file in the same format whose pairs are
      labelled as not belonging together; loaded with ``loadPairs``.
    predictedLabelFile: path of a file with one predicted group of
      tab-separated queries per line. Blank lines and lines containing
      ``'NO CLUST'`` are skipped.
    queryList: path of a tab-separated file. Column 0 is the query text
      and column 7 is a Python literal; only rows where that literal is
      non-empty are kept.

  Returns:
    A tuple ``(tp, fp, fn, tn, total_pairs)`` where
      tp: labelled-same pairs that were also predicted together,
      fp: labelled-different pairs that were predicted together,
      fn: labelled-same pairs that were not predicted together,
      tn: labelled-different pairs that were not predicted together,
      total_pairs: number of labelled-same plus labelled-different pairs.

  Raises:
    IndexError: if a line of ``queryList`` has fewer than eight columns.
    ValueError, SyntaxError: if column 7 is not a valid Python literal.
  """
  # Map each filtered query to the 1-based number (as a string) of the
  # first line of queryList on which it appears.
  queryId = {}
  with open(queryList, 'r') as queryFile:
    for lineNo, line in enumerate(queryFile, 1):
      split = line.split('\t')
      typeList = ast.literal_eval(split[7])
      if len(typeList) > 0:
        query = filterWords(split[0].strip())
        if query not in queryId:
          queryId[query] = str(lineNo)

  l_samePairs = set()
  l_points = set()
  l_diffPairs = set()
  p_samePairs = set()

  # Labelled data: pairs marked "same" and pairs marked "different".
  loadPairs(queryId, trueLabelFile, l_points, l_samePairs)
  loadPairs(queryId, differentPairFile, l_points, l_diffPairs)

  total_pairs = len(l_samePairs) + len(l_diffPairs)

  # Predicted data: keep only pairs that carry a label, so the prediction
  # is scored on the labelled subset alone.
  with open(predictedLabelFile, 'r') as predictedFile:
    for line in predictedFile:
      line = line.strip()
      if not line or 'NO CLUST' in line:
        continue
      cpoints = set()
      for entry in line.split('\t'):
        # Unknown queries yield None, which is never in l_points. This
        # replaces a bare 'except: pass' that also hid unrelated errors.
        qid = queryId.get(filterWords(entry.strip()))
        if qid in l_points:
          cpoints.add(qid)
      for pair in generatePairsFromList(sorted(cpoints)):
        if pair in l_samePairs or pair in l_diffPairs:
          p_samePairs.add(pair)

  # Single pre-formatted string: prints the same text under Python 2 and 3.
  print('%d %d %d' % (len(l_samePairs), len(p_samePairs), total_pairs))
  tp = len(l_samePairs & p_samePairs)
  fp = len(p_samePairs & l_diffPairs)
  fn = len(l_samePairs) - tp
  tn = len(l_diffPairs) - fp
  print('%d %d %d %d %d' % (tp, fp, fn, tn, total_pairs))

  return tp, fp, fn, tn, total_pairs