import os
import sys
import csv

import sklearn.metrics
import matplotlib.pyplot as plt

# getMainCat, dbQuery, uniq and summaryGraph are project-local helpers;
# sketches of uniq and summaryGraph follow below.


def analyzeCSV(groupingType, model, type=2, analysisType="3", limit="100"):
    """
    Parameters:
        model : model to analyze
        type  : 1 -> single-level sim analysis
                2 -> range analysis

    Gets all sim files in the model, iterates over the categories,
    distinguishes between single and range sim files, takes the top n
    rows from each sim file (n = number of items searched for), and
    calculates the IR measures.
    """
    # column layout of the sim CSV files
    keys = ['category', 'depth', 'idLevel', 'ocID', 'sim', 'nrOcc']
    path = "testData/%s/%s/sim/SummaryCSV/%s/" % (groupingType, model, limit)
    categories = getMainCat()
    for cat in categories:
        if type == 1:
            sysOutFile = path + cat + ".txt"
        else:
            sysOutFile = path + cat + "_Range.txt"
        # redirect print output for this category into its report file
        sys.stdout = open(sysOutFile, 'wb')
        # get the category depth (note: the category name is interpolated
        # directly into the SQL string)
        sqlMaxDepth = ("select max(categoryDepth) from dmoz_categories "
                       "where Topic like 'Top/" + str(cat) + "/%' "
                       "and filterOut = 0")
        maxDepthRS = dbQuery(sqlMaxDepth)
        maxDepth = maxDepthRS[0][0]
        print "Category: ", cat
        for level in range(2, maxDepth + 1):
            if type == 1:
                fileName = "%s%s_%s_%s.csv" % (path, model, cat, level)
                originalIDFile = "%s%s_%s_%s_original.csv" % (path, model, cat, level)
            elif type == 2:
                fileName = "%s%s_%s_1_%s.csv" % (path, model, cat, level)
                originalIDFile = "%s%s_%s_1_%s_original.csv" % (path, model, cat, level)
            # original categories from the sim csv or the oid csv
            if os.path.isfile(originalIDFile):
                nrUID = uniq(originalIDFile, 0)
            else:
                nrUID = uniq(fileName, 2)
            nrUID = [int(i) for i in nrUID]
            returnedID = uniq(fileName, 3)
            returnedID = [int(i) for i in returnedID[:len(nrUID)]]
            print "level: ", level, "\t", fileName
            print len(nrUID)
            print len(returnedID)
            if len(nrUID) == len(returnedID):
                print sklearn.metrics.classification_report(nrUID, returnedID)
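
# `uniq(path, column)` is called by analyzeCSV and analyzeSummary but never
# defined in this listing; judging from the call sites it returns the
# order-preserving unique values of one CSV column. A minimal sketch under
# that assumption (the signature is inferred, not taken from the original
# source):
def uniq(path, column):
    seen = []
    f = open(path, "rb")
    for row in csv.reader(f):
        if row[column] not in seen:
            seen.append(row[column])
    f.close()
    return seen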

from fnmatch import fnmatch

from docutils import nodes, utils

# The `visit` variants below appear to be methods of a docutils
# buildhtml-style Builder class; they prune and filter a directory walk
# and hand every *.txt file to self.process_txt().


def visit(self, recurse, directory, names):
    settings = self.get_settings('', directory)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        print >>sys.stderr, '/// ...Skipping directory (pruned):', directory
        sys.stderr.flush()
        names[:] = []
        return
    if not self.initial_settings.silent:
        print >>sys.stderr, '/// Processing directory:', directory
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    prune = 0
    for name in names:
        if name.endswith('.txt'):
            prune = self.process_txt(directory, name)
            if prune:
                break
    if not recurse:
        del names[:]

def visit(self, directory, names):
    # BUG: prune and ignore do not work
    settings = self.get_settings('', directory)
    # ErrorOutput ships with docutils (docutils.utils.error_reporting in
    # older releases, docutils.io in newer ones).
    errout = ErrorOutput(encoding=settings.error_encoding)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        print >>errout, ('/// ...Skipping directory (pruned): %s' % directory)
        sys.stderr.flush()
        names[:] = []
        return
    if not self.initial_settings.silent:
        print >>errout, '/// Processing directory: %s' % directory
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    prune = 0
    for name in names:
        if name.endswith('.txt'):
            prune = self.process_txt(directory, name)
            if prune:
                break

def indirect_target_error(self, target, explanation):
    naming = ''
    reflist = []
    if target['names']:
        naming = '"%s" ' % target['names'][0]
    for name in target['names']:
        reflist.extend(self.document.refnames.get(name, []))
    for id in target['ids']:
        reflist.extend(self.document.refids.get(id, []))
    naming += '(id="%s")' % target['ids'][0]
    msg = self.document.reporter.error(
        'Indirect hyperlink target %s refers to target "%s", %s.'
        % (naming, target['refname'], explanation), base_node=target)
    msgid = self.document.set_id(msg)
    for ref in utils.uniq(reflist):
        prb = nodes.problematic(ref.rawsource, ref.rawsource, refid=msgid)
        prbid = self.document.set_id(prb)
        msg.add_backref(prbid)
        ref.replace_self(prb)
    target.resolved = 1
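
# The docutils snippets in this listing all lean on utils.uniq for
# order-preserving deduplication (a plain set() would scramble pattern and
# backreference order). Its implementation is essentially a linear scan,
# along these lines (renamed here to avoid clashing with the CSV uniq above):
def docutils_uniq(items):
    unique = []
    for item in items:
        if item not in unique:
            unique.append(item)
    return unique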

def visit(self, directory, names, subdirectories):
    settings = self.get_settings("", directory)
    errout = ErrorOutput(encoding=settings.error_encoding)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        errout.write("/// ...Skipping directory (pruned): %s\n" % directory)
        sys.stderr.flush()
        del subdirectories[:]
        return
    if not self.initial_settings.silent:
        errout.write("/// Processing directory: %s\n" % directory)
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    for name in names:
        if name.endswith(".txt"):
            self.process_txt(directory, name)

def visit(self, directory, names, subdirectories):
    settings = self.get_settings('', directory)
    errout = ErrorOutput(encoding=settings.error_encoding)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        errout.write('/// ...Skipping directory (pruned): %s\n' % directory)
        sys.stderr.flush()
        del subdirectories[:]
        return
    if not self.initial_settings.silent:
        errout.write('/// Processing directory: %s\n' % directory)
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    for name in names:
        if name.endswith('.txt'):
            self.process_txt(directory, name)

def visit(self, recurse, directory, names):
    settings = self.get_settings("", directory)
    if settings.prune and (os.path.abspath(directory) in settings.prune):
        print >>sys.stderr, "/// ...Skipping directory (pruned):", directory
        sys.stderr.flush()
        names[:] = []
        return
    if not self.initial_settings.silent:
        print >>sys.stderr, "/// Processing directory:", directory
        sys.stderr.flush()
    # settings.ignore grows many duplicate entries as we recurse
    # if we add patterns in config files or on the command line.
    for pattern in utils.uniq(settings.ignore):
        for i in range(len(names) - 1, -1, -1):
            if fnmatch(names[i], pattern):
                # Modify in place!
                del names[i]
    for name in names:
        if name.endswith(".txt"):
            if self.process_txt(directory, name):
                break  # prune
    if not recurse:
        del names[:]
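
# The alternative `visit` signatures above map onto two directory walkers.
# The (recurse, directory, names) form fits Python 2's os.path.walk(), which
# calls visitor(arg, dirname, names) -- i.e. os.path.walk(root, self.visit,
# recurse). The (directory, names, subdirectories) form is driven by a loop
# over os.walk(), where deleting entries from the subdirectory list in place
# prunes the rest of the walk. A sketch of that newer driver (the method
# name `walk` is an assumption, not from the original source):
def walk(self, root):
    for directory, subdirectories, names in os.walk(root):
        self.visit(directory, names, subdirectories)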

def analyzeSummary(groupingType, model="0.1"):
    """
    Make graphs from the summary files.

    Input CSV summary structure:
    "Category","Level","Model","docsInModel","ReturnedDocsForModel","NrInputDocs"
    """
    path = "testData/%s/%s/summary_100.csv" % (groupingType, model)
    labels = []
    categories = uniq(path, 0)
    plt.rc('legend', **{'fontsize': 6})
    contentSingleDocsInModel = []
    contentSingleReturnedDocs = []
    contentRangeDocsInModel = []
    contentRangeReturnedDocs = []
    # graphs for the summary
    for cat in categories:
        # range data: range models have three underscores in their name
        f = open(path, "rb")
        reader = csv.reader(f)
        contentRange = [row for row in reader
                        if row[0] == cat and row[2].count('_') == 3]
        f.close()
        tempRangeDocsInModel = [int(x[3]) for x in contentRange]
        tempRangeReturnedDocsForModel = [int(x[4]) for x in contentRange]
        # single data: single-level models have two underscores
        f = open(path, "rb")  # don't forget the 'b'!
        reader = csv.reader(f)
        contentSingle = [row for row in reader
                         if row[0] == cat and row[2].count('_') == 2]
        f.close()
        singleDocsInModel = [int(x[3]) for x in contentSingle]
        singleReturnedDocsForModel = [int(x[4]) for x in contentSingle]
        # pad every series with zeros so all categories have equal length
        position = len(singleDocsInModel) + 1
        while position < 14:
            singleDocsInModel.insert(position, 0)
            singleReturnedDocsForModel.insert(position, 0)
            tempRangeDocsInModel.insert(position, 0)
            tempRangeReturnedDocsForModel.insert(position, 0)
            position += 1
        # single model
        contentSingleDocsInModel.append(singleDocsInModel)
        contentSingleReturnedDocs.append(singleReturnedDocsForModel)
        # range model
        contentRangeDocsInModel.append(tempRangeDocsInModel)
        contentRangeReturnedDocs.append(tempRangeReturnedDocsForModel)
        labels.append(cat)
    summaryGraph(contentSingleDocsInModel, contentSingleReturnedDocs,
                 labels, "single", model)
    summaryGraph(contentRangeDocsInModel, contentRangeReturnedDocs,
                 labels, "range", model)
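
# summaryGraph is not part of this listing; analyzeSummary hands it parallel
# per-category series plus labels. A minimal sketch of a plausible
# implementation (every name and the output path here are assumptions, not
# the original code):
def summaryGraph(docsInModel, returnedDocs, labels, kind, model):
    plt.figure()
    for inModel, returned, label in zip(docsInModel, returnedDocs, labels):
        plt.plot(inModel, label=label)      # docs in the model
        plt.plot(returned, linestyle='--')  # returned docs for the model
    plt.xlabel('level')
    plt.ylabel('number of documents')
    plt.title('%s model %s' % (kind, model))
    plt.legend()
    plt.savefig('summary_%s_%s.png' % (kind, model))
    plt.close()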