def test_Corpus_interacao(self):
    """Smoke-test iterating a Corpus built from a directory with description.inf.

    Builds a Corpus from a hard-coded local directory, prints its ``path``,
    ``parser`` and ``proof`` attributes and the number of documents, then
    fetches one document through ``getDocument()``.
    """
    corpus = Corpus(
        "/home/iamjabour/workspacePessoal/tese/corpus/machine_learning_tableextraction_paper2/webtablegrnd/html"
    )

    print("path %s" % (corpus.path,))
    print("parser %s" % (corpus.parser,))
    print("proof %s" % (corpus.proof,))
    print("documents len %s" % (len(corpus.documents),))

    fetched = 0
    while True:
        document = corpus.getDocument()
        fetched += 1
        # NOTE(review): the counter is already 1 here, so this condition is
        # always true and the loop stops after the first fetch; the path
        # print below is never reached. Confirm whether that is intended.
        if document == None or fetched > 0:
            break
        print(document.path)
def __init__(self, corpusPath, configFile=None, extractors=None,
             limit=int(2**31-1), output=None):
    """Set up the benchmark: configuration, metric, marker and corpus.

    Args:
        corpusPath: path handed to ``Corpus`` to load the documents.
        configFile: configuration file handed to ``Configurator``. When
            falsy, ``config_example.cnf`` is used if it exists in the
            current directory, otherwise ``apps/config_example.cnf``.
        extractors: extractors to run over each document. Defaults to a
            fresh empty list for every instance.
        limit: stored as ``self.limit``.
        output: file-like object stored as ``self.output``; defaults to
            ``sys.stdout``.
    """
    if not configFile:
        # Fall back to the example configuration shipped with the project.
        if os.path.exists('config_example.cnf'):
            configFile = 'config_example.cnf'
        else:
            configFile = 'apps/config_example.cnf'
    config = Configurator(configFile)

    self.output = sys.stdout if output is None else output
    self.metric = config.metric()
    self.marker = self.metric.marker()
    self.corpus = Corpus(corpusPath)
    # A literal [] default would be one list shared by every instance that
    # omits the argument; build a new one per instance instead.
    self.extractors = [] if extractors is None else extractors
    self.limit = limit
    # Per-document results, filled in as documents are processed.
    self.benchmark = []
def __init__(self, corpusPath, configFile=None, extractors=None,
             limit=int(2**31-1), output=None, pfilenames=False,
             initial=0):
    """Set up the benchmark: configuration, metric, marker and corpus.

    Args:
        corpusPath: path handed to ``Corpus`` to load the documents.
        configFile: configuration file handed to ``Configurator``. When
            falsy, ``config_example.cnf`` is used if it exists in the
            current directory, otherwise ``apps/config_example.cnf``.
        extractors: extractors that will be used. Defaults to a fresh
            empty list for every instance.
        limit: stored as ``self.limit``; a falsy value is replaced by
            ``2**31 - 1``, meaning "run over the whole corpus".
        output: file-like object where the benchmark is printed;
            defaults to ``sys.stdout``.
        pfilenames: when True, only the file id and its path are meant to
            be printed.
        initial: stored as ``self.initial``.
    """
    if not configFile:
        # Fall back to the default configuration file.
        if os.path.exists('config_example.cnf'):
            configFile = 'config_example.cnf'
        else:
            configFile = 'apps/config_example.cnf'
    config = Configurator(configFile)

    # Plain attributes are assigned before the corpus is loaded so the
    # instance is never left partially initialised (the original returned
    # early, before setting them, when the corpus compared equal to None).
    self.pfilenames = pfilenames
    self.output = sys.stdout if output is None else output
    # A literal [] default would be one list shared by every instance that
    # omits the argument; build a new one per instance instead.
    self.extractors = [] if extractors is None else extractors
    self.limit = limit if limit else int(2**31-1)
    self.initial = initial
    # Empty list where the per-document results will be stored.
    self.benchmark = []

    self.metric = config.metric()
    self.marker = self.metric.marker()
    self.corpus = Corpus(corpusPath)
class Benchmark:
    """Run extractors over a corpus and score them with the configured metric.

    ``process()`` stores one partial result per document in
    ``self.benchmark``; ``pprint()`` consolidates and prints them.
    """

    # Limit value meaning "run over the whole corpus".
    _NO_LIMIT = int(2**31-1)

    def __init__(self, corpusPath, configFile=None, extractors=None,
                 limit=int(2**31-1), output=None, pfilenames=False,
                 initial=0):
        """Load configuration, metric, marker and corpus.

        Args:
            corpusPath: path handed to ``Corpus`` to load the documents.
            configFile: configuration file handed to ``Configurator``.
                When falsy, ``config_example.cnf`` is used if it exists in
                the current directory, otherwise
                ``apps/config_example.cnf``.
            extractors: extractors that will be used. Defaults to a fresh
                empty list for every instance.
            limit: highest document id to process; a falsy value or
                ``2**31 - 1`` means the whole corpus.
            output: file-like object where the final scores are written;
                defaults to ``sys.stdout``.
            pfilenames: when True, extractors are not run (only the file
                id and path are meant to be printed).
            initial: document counter value to start from.
        """
        if not configFile:
            # Fall back to the default configuration file.
            if os.path.exists('config_example.cnf'):
                configFile = 'config_example.cnf'
            else:
                configFile = 'apps/config_example.cnf'
        config = Configurator(configFile)

        # Plain attributes are assigned before the corpus is loaded so the
        # instance is never left partially initialised (the original
        # returned early, before setting them, when the corpus compared
        # equal to None).
        self.pfilenames = pfilenames
        self.output = sys.stdout if output is None else output
        # A literal [] default would be one list shared by every instance
        # that omits the argument; build a new one per instance instead.
        self.extractors = [] if extractors is None else extractors
        self.limit = limit if limit else self._NO_LIMIT
        self.initial = initial
        # Empty list where the per-document results will be stored.
        self.benchmark = []

        self.metric = config.metric()
        self.marker = self.metric.marker()
        self.corpus = Corpus(corpusPath)

    def process(self, debug=False):
        """Run the benchmark, storing partial results in ``self.benchmark``.

        Use ``pprint()`` to see the consolidated result.

        Args:
            debug: when True, print the counter/limit and each document's
                id and path while processing.
        """
        count = self.initial
        whole_corpus = self.limit == self._NO_LIMIT
        while True:
            count += 1
            if debug:
                print("%s %s" % (count, self.limit))

            if whole_corpus:
                doc = self.corpus.getDocument()
            elif count <= self.limit:
                doc = self.corpus.getDocument(id=count)
            else:
                break
            if not doc:
                break

            if debug:
                # Best-effort diagnostic: a document without id/path must
                # not abort the run, but interpreter-level signals such as
                # KeyboardInterrupt are no longer swallowed.
                try:
                    print("%s %s" % (doc.id, doc.path))
                except Exception:
                    pass

            proof = self.corpus.getProof(doc)
            result = None
            if not self.pfilenames:
                for extractor in self.extractors:
                    self.marker.reset()
                    extractor.process(doc.content, self.marker)
                    result = self.metric.process(self.marker.labels, proof)
            # NOTE(review): the original nesting of this step could not be
            # recovered from the collapsed source. Here one result is kept
            # per document (the last extractor's), and nothing is stored
            # when no extractor ran, because pprint() cannot consolidate
            # the (0.0, 0.0) placeholder. Confirm against the callers.
            if result is not None:
                self.benchmark.append(result)

    def pprint(self):
        """Consolidate the benchmark and print the result.

        Per-document entries and the totals go to standard output; the
        final scores are written to ``self.output``.

        Returns:
            dict mapping each key to ``[recall, precision, f]``, or an
            empty dict when no result was produced.
        """
        if not self.benchmark:
            print('Nao foi gerado nenhum resultado')
            return {}

        # Each per-document value is read as three counters; from their
        # use below: [0] hits, [1] recall denominator, [2] precision
        # denominator.
        totals = {}
        for index, doc in enumerate(self.benchmark):
            print("%s %s" % (index, doc))
            for key in doc:
                # setdefault also covers keys missing from the first doc.
                counters = totals.setdefault(key, [0, 0, 0])
                for position in range(3):
                    counters[position] += doc[key][position]

        print("")
        print('total')
        print(totals)

        scores = {}
        for key in totals:
            hits, expected, found = totals[key]
            if expected > 0:
                recall = hits / float(expected)
                precision = hits / float(found) if found > 0 else 1
            else:
                # Nothing expected: recall is perfect, and precision is
                # perfect only if nothing was extracted either.
                recall = 1
                precision = 1 if found == 0 else 0
            if (recall + precision) > 0:
                f = (2 * recall * precision) / (recall + precision)
            else:
                f = 0.0
            scores[key] = [recall, precision, f]

        print("")
        self.output.write('key\trecall\tprecision' + '\n')
        self.output.write(str(scores) + '\n')
        return scores
class Benchfile:
    """Run extractors over a corpus, or a single file, and score them.

    ``process()`` stores partial results in ``self.benchmark``;
    ``pprint()`` consolidates and prints them.
    """

    def __init__(self, corpusPath, configFile=None, extractors=None,
                 limit=int(2**31-1), output=None):
        """Load configuration, metric, marker and corpus.

        Args:
            corpusPath: path handed to ``Corpus`` to load the documents.
            configFile: configuration file handed to ``Configurator``.
                When falsy, ``config_example.cnf`` is used if it exists in
                the current directory, otherwise
                ``apps/config_example.cnf``.
            extractors: extractors to run over each document. Defaults to
                a fresh empty list for every instance.
            limit: maximum number of documents ``process()`` handles.
            output: file-like object where the final scores are written;
                defaults to ``sys.stdout``.
        """
        if not configFile:
            # Fall back to the example configuration file.
            if os.path.exists('config_example.cnf'):
                configFile = 'config_example.cnf'
            else:
                configFile = 'apps/config_example.cnf'
        config = Configurator(configFile)

        self.output = sys.stdout if output is None else output
        self.metric = config.metric()
        self.marker = self.metric.marker()
        self.corpus = Corpus(corpusPath)
        # A literal [] default would be one list shared by every instance
        # that omits the argument; build a new one per instance instead.
        self.extractors = [] if extractors is None else extractors
        self.limit = limit
        self.benchmark = []

    def _process(self, doc):
        """Run every extractor on ``doc`` and store the metric result."""
        proof = self.corpus.getProof(doc)
        result = None
        for extractor in self.extractors:
            extractor.process(doc.content, self.marker)
            result = self.metric.process(self.marker.labels, proof)
        # NOTE(review): the original nesting of this step could not be
        # recovered from the collapsed source. Here one result is kept per
        # document, and nothing is stored when no extractor ran, because
        # pprint() cannot consolidate the (0.0, 0.0) placeholder. Confirm
        # against the callers.
        if result is not None:
            self.benchmark.append(result)

    def process(self, file=False):
        """Run the benchmark, storing partial results in ``self.benchmark``.

        Use ``pprint()`` to see the consolidated result.

        Args:
            file: when truthy, it is passed to ``corpus.getDocument`` and
                only that document is processed; otherwise documents are
                fetched one by one until the corpus returns None or
                ``self.limit`` documents were handled.
        """
        if file:
            doc = self.corpus.getDocument(file)
            print("process Document: %s" % (doc.path,))
            # _process() has no return value, so this prints "None";
            # kept so the console output stays the same.
            print(self._process(doc))
            return

        count = 0
        while True:
            count += 1
            doc = self.corpus.getDocument()
            if doc == None or count > self.limit:
                break
            self._process(doc)

    def pprint(self):
        """Consolidate the benchmark and print the result.

        Per-document entries and the totals go to standard output; the
        final scores are written to ``self.output``.

        Returns:
            dict mapping each key to ``[recall, precision]``, or an empty
            dict when no result was produced.
        """
        if not self.benchmark:
            # The original raised IndexError here; report it the same way
            # Benchmark.pprint does.
            print('Nao foi gerado nenhum resultado')
            return {}

        # Each per-document value is read as three counters; from their
        # use below: [0] hits, [1] recall denominator, [2] precision
        # denominator.
        totals = {}
        for index, doc in enumerate(self.benchmark):
            print("%s %s" % (index, doc))
            for key in doc:
                # setdefault also covers keys missing from the first doc.
                counters = totals.setdefault(key, [0, 0, 0])
                for position in range(3):
                    counters[position] += doc[key][position]

        print("")
        print('total')
        print(totals)

        scores = {}
        for key in totals:
            hits, expected, found = totals[key]
            if expected > 0:
                recall = hits / float(expected)
                precision = hits / float(found) if found > 0 else 1
            else:
                # Nothing expected: recall is perfect, and precision is
                # perfect only if nothing was extracted either.
                recall = 1
                precision = 1 if found == 0 else 0
            scores[key] = [recall, precision]

        print("")
        self.output.write('key\trecall\tprecision' + '\n')
        self.output.write(str(scores) + '\n')
        return scores