import os
import sys
import csv
import sklearn.metrics
import matplotlib.pyplot as plt

# getMainCat, dbQuery, uniq and summaryGraph are project helpers defined
# elsewhere in this codebase.

def analyzeCSV(groupingType, model, type=2, analysisType="3", limit="100"):
    """
    Parameters:
        model : model to analyze
        type  : 1 -> single-level sim analysis
                2 -> range analysis

    Get all sim files in the model, iterate over the categories,
    distinguish between single and range sim files, take the top n rows
    from each sim file (n = number of items searched for), and calculate
    the IR measures.
    """
    # column layout of the sim csv files (not used further in this excerpt)
    keys = ['category', 'depth', 'idLevel', 'ocID', 'sim', 'nrOcc']
    path = "testData/%s/%s/sim/SummaryCSV/%s/" %(groupingType,model,limit)
    categories = getMainCat()
    
    for cat in categories:
        if type == 1:
            sysOutFile = path + cat + ".txt"
        else:
            sysOutFile = path + cat + "_Range.txt"

        sys.stdout = open(sysOutFile, 'w')  # redirect all prints to the report file
        # get the category's maximum depth
        sqlMaxDepth = ("select max(categoryDepth) from dmoz_categories "
                       "where Topic like 'Top/" + str(cat) + "/%' and filterOut = 0")
        maxDepthRS = dbQuery(sqlMaxDepth)
        maxDepth = maxDepthRS[0][0]
        ranger = range(2, maxDepth + 1)
        print "Category: ", cat
        for level in ranger:
            
            if type == 1:
                fileName = "%s%s_%s_%s.csv" % (path,model,cat,level)
                originalIDFile = "%s%s_%s_%s_original.csv" % (path,model,cat,level) 
            elif type == 2:
                fileName = "%s%s_%s_1_%s.csv" % (path,model,cat,level)
                originalIDFile = "%s%s_%s_1_%s_original.csv" % (path,model,cat,level)
            
            # original categories come from the *_original.csv file when it
            # exists, otherwise from the sim csv itself
            if os.path.isfile(originalIDFile):
                nrUID = uniq(originalIDFile, 0)
            else:
                nrUID = uniq(fileName, 2)

            nrUID = [int(i) for i in nrUID]
            # truncate the returned IDs to the number of expected IDs
            returnedID = uniq(fileName, 3)
            returnedID = [int(i) for i in returnedID[:len(nrUID)]]
            print "level: ", level, "\t", fileName
            print len(nrUID)
            print len(returnedID)
            # the report is only meaningful when both lists line up
            if len(nrUID) == len(returnedID):
                print sklearn.metrics.classification_report(nrUID, returnedID)
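
analyzeCSV leans on a uniq(path, column) helper that is not part of this
listing. A minimal sketch of what it is assumed to do, inferred from how it
is called here (return the unique values of one CSV column in order of first
appearance); the implementation is an illustrative guess, not the original:

def uniq(path, column):
    # Collect the distinct values of a single CSV column, keeping the
    # order in which they first appear in the file.
    seen = []
    f = open(path, "rb")  # binary mode for the Python 2 csv module
    for row in csv.reader(f):
        if row and row[column] not in seen:
            seen.append(row[column])
    f.close()
    return seen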
Example #2
def visit(self, recurse, directory, names):
    settings = self.get_settings('', directory)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        print >> sys.stderr, '/// ...Skipping directory (pruned):', directory
        sys.stderr.flush()
        names[:] = []
        return
    if not self.initial_settings.silent:
        print >> sys.stderr, '/// Processing directory:', directory
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    prune = 0
    for name in names:
        if name.endswith('.txt'):
            prune = self.process_txt(directory, name)
            if prune:
                break
    if not recurse:
        del names[:]
Example #3
def visit(self, directory, names):
    # BUG prune and ignore do not work
    settings = self.get_settings('', directory)
    errout = ErrorOutput(encoding=settings.error_encoding)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        print >>errout, ('/// ...Skipping directory (pruned): %s' %
                          directory)
        sys.stderr.flush()
        names[:] = []
        return
    if not self.initial_settings.silent:
        print >>errout, '/// Processing directory: %s' % directory
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    prune = 0
    for name in names:
        if name.endswith('.txt'):
            prune = self.process_txt(directory, name)
            if prune:
                break
Example #4
def indirect_target_error(self, target, explanation):
    naming = ''
    reflist = []
    if target['names']:
        naming = '"%s" ' % target['names'][0]
    for name in target['names']:
        reflist.extend(self.document.refnames.get(name, []))
    for id in target['ids']:
        reflist.extend(self.document.refids.get(id, []))
    naming += '(id="%s")' % target['ids'][0]
    msg = self.document.reporter.error(
        'Indirect hyperlink target %s refers to target "%s", %s.' %
        (naming, target['refname'], explanation),
        base_node=target)
    msgid = self.document.set_id(msg)
    for ref in utils.uniq(reflist):
        prb = nodes.problematic(ref.rawsource, ref.rawsource, refid=msgid)
        prbid = self.document.set_id(prb)
        msg.add_backref(prbid)
        ref.replace_self(prb)
    target.resolved = 1
Example #5
def indirect_target_error(self, target, explanation):
    naming = ''
    reflist = []
    if target['names']:
        naming = '"%s" ' % target['names'][0]
    for name in target['names']:
        reflist.extend(self.document.refnames.get(name, []))
    for id in target['ids']:
        reflist.extend(self.document.refids.get(id, []))
    naming += '(id="%s")' % target['ids'][0]
    msg = self.document.reporter.error(
          'Indirect hyperlink target %s refers to target "%s", %s.'
          % (naming, target['refname'], explanation), base_node=target)
    msgid = self.document.set_id(msg)
    for ref in utils.uniq(reflist):
        prb = nodes.problematic(
              ref.rawsource, ref.rawsource, refid=msgid)
        prbid = self.document.set_id(prb)
        msg.add_backref(prbid)
        ref.replace_self(prb)
    target.resolved = 1
Example #6
def visit(self, directory, names, subdirectories):
    settings = self.get_settings("", directory)
    errout = ErrorOutput(encoding=settings.error_encoding)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        errout.write("/// ...Skipping directory (pruned): %s\n" % directory)
        sys.stderr.flush()
        del subdirectories[:]
        return
    if not self.initial_settings.silent:
        errout.write("/// Processing directory: %s\n" % directory)
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    for name in names:
        if name.endswith(".txt"):
            self.process_txt(directory, name)
Example #7
def visit(self, directory, names, subdirectories):
    settings = self.get_settings('', directory)
    errout = ErrorOutput(encoding=settings.error_encoding)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        errout.write('/// ...Skipping directory (pruned): %s\n' %
                     directory)
        sys.stderr.flush()
        del subdirectories[:]
        return
    if not self.initial_settings.silent:
        errout.write('/// Processing directory: %s\n' % directory)
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    for name in names:
        if name.endswith('.txt'):
            self.process_txt(directory, name)
Example #8
def visit(self, recurse, directory, names):
    settings = self.get_settings("", directory)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        print >>sys.stderr, "/// ...Skipping directory (pruned):", directory
        sys.stderr.flush()
        names[:] = []
        return
    if not self.initial_settings.silent:
        print >>sys.stderr, "/// Processing directory:", directory
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    for name in names:
        if name.endswith(".txt"):
            if self.process_txt(directory, name):
                break  # prune
    if not recurse:
        del names[:]
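
All of the visit() variants above share the same pruning idiom: they walk
the index range backwards while deleting, and they mutate names in place.
A standalone sketch of why (the pattern 'build.*' and the sample list are
made up for illustration):

from fnmatch import fnmatch

names = ['a.txt', 'build.txt', 'notes.txt', 'build.log']
# Deleting while iterating forward would skip the element that slides into
# the freed slot; walking backwards leaves the unvisited indices stable.
for i in range(len(names) - 1, -1, -1):
    if fnmatch(names[i], 'build.*'):
        del names[i]
print names  # ['a.txt', 'notes.txt']

Mutating the list in place (del names[i], names[:] = []) matters because the
caller of visit() keeps its own reference to it, as os.path.walk-style
traversal does; rebinding the local name would not be seen by the caller.
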
def analyzeSummary(groupingType, model="0.1"):
    """
    Make graphs from the summary files.
    Input CSV summary structure:
        "Category","Level","Model","docsInModel","ReturnedDocsForModel","NrInputDocs"
    """
    path = "testData/%s/%s/summary_100.csv" %(groupingType,model)
    labels = []
    categories = uniq(path, 0)
    plt.rc('legend', fontsize=6)

    contentSingleDocsInModel = []
    contentSingleReturnedDocs = []

    contentRangeDocsInModel = []
    contentRangeReturnedDocs = []
    rangeLabels = []

    # graphs for summary
    for cat in categories:
        content = []
        # range data: model names with three underscores are range models
        f = open(path, "rb")
        reader = csv.reader(f)
        contentRange = [row for row in reader if row[0] == cat and row[2].count('_') == 3]

        tempRangeDocsInModel = [int(x[3]) for x in contentRange]
        tempRangeReturnedDocsForModel = [int(x[4]) for x in contentRange]
        f.close()
        
        # single data: model names with two underscores are single models
        f = open(path, "rb")  # binary mode is required by the Python 2 csv module
        reader = csv.reader(f)
        contentSingle = [row for row in reader if row[0] == cat and row[2].count('_') == 2]
        # values
        singleDocsInModel = [int(x[3]) for x in contentSingle]
        singleReturnedDocsForModel = [int(x[4]) for x in contentSingle]

        position = len(singleDocsInModel) + 1

        # pad all series with zeros so every category covers the same
        # number of levels and the graphs line up
        while position < 14:
            singleDocsInModel.insert(position, 0)
            singleReturnedDocsForModel.insert(position, 0)
            tempRangeDocsInModel.insert(position, 0)
            tempRangeReturnedDocsForModel.insert(position, 0)
            position += 1
        
        
        # single model
        contentSingleDocsInModel.append(singleDocsInModel)
        contentSingleReturnedDocs.append(singleReturnedDocsForModel)

        # range model
        contentRangeDocsInModel.append(tempRangeDocsInModel)
        contentRangeReturnedDocs.append(tempRangeReturnedDocsForModel)
        f.close()
       
        #labels
        labels.append(cat)
        
    summaryGraph(contentSingleDocsInModel, contentSingleReturnedDocs, labels, "single", model)
    summaryGraph(contentRangeDocsInModel, contentRangeReturnedDocs, labels, "range", model)
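
summaryGraph is not included in this listing either. A minimal sketch of a
compatible implementation, assuming it draws one line per category on two
stacked axes and saves the figure; the signature is inferred from the calls
above, and the file name and layout are illustrative guesses:

def summaryGraph(docsInModel, returnedDocs, labels, kind, model):
    # Two stacked axes: docs contained in the model vs. docs returned by it,
    # one line per category, sharing the level axis.
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    for series, label in zip(docsInModel, labels):
        ax1.plot(series, label=label)
    for series, label in zip(returnedDocs, labels):
        ax2.plot(series, label=label)
    ax1.set_ylabel("docs in model")
    ax2.set_ylabel("returned docs")
    ax1.set_title("%s model %s" % (kind, model))
    ax1.legend(loc="upper right")
    fig.savefig("summary_%s_%s.png" % (kind, model))  # hypothetical output path
    plt.close(fig)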