def __init__(self, filename, configurations, delay=False):
    """Load a knowledge base and (optionally) build the training corpus.

    Args:
        filename: knowledge-base file handed to ``prep.processKnowledgeBase``.
        configurations: iterable of preprocessing-flag dicts; each is encoded
            to a fixed-order bit-string via ``Config.code_config``.
        delay: when True, skip corpus construction (caller builds it later,
            e.g. via an idfsearch variant that needs a TF-IDF model first).
    """
    self.corpus = Corpus()
    # NOTE: each configuration encoding is independent, so this could be
    # parallelized if it ever becomes a bottleneck.
    self.coded_configurations = [Config.code_config(c) for c in configurations]
    self.train, self.dev = prep.processKnowledgeBase(filename, validate=True)
    if not delay:
        for answer_key, question_texts in self.train.items():
            questions = []
            for qtext in question_texts:
                # Preprocess the question once per coded configuration; the
                # coded string doubles as the lookup key for the format.
                qformat = {cc: process_chain(qtext, cc)
                           for cc in self.coded_configurations}
                questions.append(Question(qtext, qformat))
            self.corpus.add(QA(questions, Answer(answer_key)))
def loadCorpus(filename: str, configuration):
    """Build a runtime Corpus from *filename* for one preprocessing configuration.

    Args:
        filename: knowledge-base file handed to ``extract``.
        configuration: coded configuration string used both as the format key
            and to parameterize the TF-IDF model.

    Returns:
        A ``Corpus`` whose QA entries carry answers (unlike the training path,
        ``Answer`` gets the answer text as well as the key).
    """
    corpus = Corpus()
    kb = extract(filename, validate=False)
    # Rebind the shared "tfidf" hook so process_chain uses statistics from
    # this knowledge base (runtime=True distinguishes it from training mode).
    Config.preprocessing_method_functions["tfidf"] = TFidf(kb, configuration,
                                                           runtime=True)
    for key, qa in kb.items():
        question_texts, answer_text = qa[0], qa[1]
        questions = [
            Question(qtext, {configuration: process_chain(qtext, configuration)})
            for qtext in question_texts
        ]
        corpus.add(QA(questions, Answer(key, answer_text)))
    return corpus
def idfsearchGroup(self, idfconfig, metric_functions, metric_functions_names):
    """Evaluate a single TF-IDF-backed configuration using grouped queries.

    Rebuilds ``self.corpus`` from the training split with only *idfconfig*'s
    preprocessing applied, then evaluates against the dev split.

    Args:
        idfconfig: preprocessing-flag dict; encoded via ``Config.code_config``.
        metric_functions: scoring callables, paired positionally with
            *metric_functions_names*.
        metric_functions_names: display name for each metric function.

    Returns:
        Whatever ``self._evaluate`` returns (per-metric scores, answers).
    """
    self.corpus = Corpus()
    idfformat = Config.code_config(idfconfig)
    # Install a TF-IDF model trained on the training split; process_chain
    # picks it up through the shared class-level registry.
    Config.preprocessing_method_functions["tfidf"] = TFidf(self.train, idfformat)
    for answer_key, question_texts in self.train.items():
        questions = [
            Question(qtext, {idfformat: process_chain(qtext, idfformat)})
            for qtext in question_texts
        ]
        self.corpus.add(QA(questions, Answer(answer_key)))
    # zip pairs each function with its name (replaces a manual index counter).
    metrics = [Metric(idfformat, mf, name=name)
               for mf, name in zip(metric_functions, metric_functions_names)]
    return self._evaluate(self.corpus.queryGroup, metrics)
def process(filestem):
    """Read the corpus at *filestem*, apply the Nesting refiner, and write
    the result to '<filestem>.refined.xml'.

    Only the Nesting refiner is currently enabled; the others are kept
    below, commented out, as a reference of what has been tried.
    """
    # Removed the unused '<filestem>.relaxed.xml' name that was computed here
    # but never written to.
    new_refined = "%s.refined.xml" % filestem
    corpus = Corpus()
    corpus.readFromFile(filestem)
    #CorpusRefiner(corpus,NOT).resolveAll()
    #CorpusRefiner(corpus,EQUAL).resolveAll()
    #CorpusRefiner(corpus,COREFER).resolveAll()
    #CorpusRefiner(corpus,RELENT).resolveAll()
    #CorpusRefiner(corpus,MUTUAL).resolveAll()
    CorpusRefiner(corpus, Nesting).resolveAll()
    #CorpusRefiner(corpus,Pack).resolveAll()
    corpus.writeToFile(new_refined)
def initialisation():
    """Fit the reference model over every corpus in the dataset directory and
    publish the decision threshold and reference model as module globals.
    """
    global beta, GLOBAL_REF_MODEL
    # Windows-style relative glob; consider pathlib for portability.
    dataset_path = '.\\dataset\\*.txt'
    models = [Corpus(path_to_corpus=file_name).freq_result
              for file_name in glob.glob(dataset_path)]
    m = Model(*models)
    beta = m.beta
    GLOBAL_REF_MODEL = m.global_ref_model
def initialisation():
    """Fit the reference model over every dataset corpus and publish the
    decision threshold and global reference model as module globals.
    """
    global beta, GLOBAL_REF_MODEL
    # NOTE(review): hard-coded absolute path — works only on this machine;
    # left unchanged to preserve behavior, but should become configurable.
    dataset_path = r'C:\PFE\datasets\v1\*.txt'
    models = [
        Corpus(path_to_corpus=file_name).bi_phonemes_frequencies_list
        for file_name in glob.glob(dataset_path)
    ]
    m = Model(*models)
    beta = m.beta
    GLOBAL_REF_MODEL = m.global_reference_model
def post(self):
    """Classify the posted speech sample against the global reference model.

    Appends the incoming speech to the stored text, fits a speaker model on
    the combined text, and compares its beta against the global threshold.
    """
    args = Classification.parser.parse_args()
    speech_text = args['speech']
    # Accumulate the new speech on top of the previously stored text.
    with open(file='.\\text.txt', encoding="utf-8") as stored:
        combined = stored.read()
    combined = combined + ' ' + speech_text + ' '
    speaker_model = Model(Corpus(corpus=combined).freq_result, GLOBAL_REF_MODEL)
    phi = speaker_model.beta
    label = 'Pathological' if phi >= beta else 'Healthy'
    return {'classification': label}
class Config(object):
    """Encodes preprocessing configurations and runs retrieval evaluations.

    A configuration is a dict of preprocessing flags; ``code_config`` turns
    it into a fixed-order bit-string that is used as the lookup key for a
    question's preprocessed formats throughout the corpus.
    """

    # Canonical ordering of preprocessing steps; the position of each name
    # defines its bit position in a coded configuration string.
    preprocessing_methods = [
        "pontuation",
        "acents",
        "numbers",
        "lowercase",
        "tokenize",
        "stopw_nltk",
        "stopw_minimal",
        "stem",
        "tfidf",
    ]
    # Maps each step name to its callable. "tfidf" starts as None and is
    # rebound at runtime (see idfsearch/idfsearchGroup/loadCorpus) because it
    # needs a model trained on the current knowledge base.
    preprocessing_method_functions = {
        "pontuation": prep.removePunctuation,
        "acents": unidecode.unidecode,
        "numbers": prep.removeNumbers,
        "lowercase": prep.lowerCase,
        "tokenize": prep.tokenize,
        "stem": prep.stem,
        "stopw_nltk": prep.RemoveStopWords(prep.stopwords_list["nltk"]),
        "stopw_minimal": prep.RemoveStopWords(prep.stopwords_list["minimal"]),
        "tfidf": None
    }

    def __init__(self, filename, configurations, delay=False):
        """Load the knowledge base and (unless *delay*) build the training corpus.

        Args:
            filename: knowledge-base file passed to prep.processKnowledgeBase.
            configurations: iterable of preprocessing-flag dicts.
            delay: when True, skip corpus construction (a later call, e.g.
                idfsearch, rebuilds it once the TF-IDF model is available).
        """
        self.corpus = Corpus()
        self.coded_configurations = [
            Config.code_config(c) for c in configurations
        ]  # NOTE: each encoding is independent — could be done in parallel
        train, dev = prep.processKnowledgeBase(filename, validate=True)
        self.dev = dev
        self.train = train
        if not delay:
            for k, qs in self.train.items():
                qs_q = []
                for qtext in qs:
                    # One preprocessed variant per coded configuration.
                    qformat = {}
                    for cc in self.coded_configurations:
                        qformat[cc] = process_chain(qtext, cc)
                    qs_q.append((qtext, qformat))
                self.corpus.add(
                    QA([Question(qtext, qformat) for qtext, qformat in qs_q],
                       Answer(k)))

    def search(self, metric_functions, metric_functions_names):
        """Evaluate all coded configurations using per-question queries."""
        return self._search(metric_functions, metric_functions_names,
                            self.corpus.query)

    def searchGroup(self, metric_functions, metric_functions_names):
        """Evaluate all coded configurations using grouped queries."""
        return self._search(metric_functions, metric_functions_names,
                            self.corpus.queryGroup)

    def _search(self, metric_functions, metric_functions_names, query_func):
        """Build a Metric per (function, configuration) pair and evaluate.

        Each metric function gets one name; the same name is reused across
        all coded configurations for that function.
        """
        metrics = []
        i = 0
        for mf in metric_functions:
            for cc in self.coded_configurations:
                metrics.append(Metric(cc, mf, name=metric_functions_names[i]))
            i += 1
        return self._evaluate(query_func, metrics)

    #Config.preprocessing_method_functions["tfidf"] = prep.TFidf(
    #self.train,
    #idfqformat)

    def idfsearch(self, idfconfig, metric_functions, metric_functions_names):
        """Rebuild the corpus for one TF-IDF-backed configuration and evaluate
        with per-question queries.

        Side effect: rebinds the class-level "tfidf" preprocessing hook to a
        model trained on the training split.
        """
        self.corpus = Corpus()
        idfformat = Config.code_config(idfconfig)
        Config.preprocessing_method_functions["tfidf"] = TFidf(
            self.train, idfformat)
        for k, qs in self.train.items():
            qs_q = []
            #print(qs)
            for qtext in qs:
                qformat = {}
                qformat[idfformat] = process_chain(qtext, idfformat)
                qs_q.append((qtext, qformat))
            self.corpus.add(
                QA([Question(qtext, qformat) for qtext, qformat in qs_q],
                   Answer(k)))
        metrics = []
        i = 0
        for mf in metric_functions:
            metrics.append(
                Metric(idfformat, mf, name=metric_functions_names[i]))
            i += 1
        return self._evaluate(self.corpus.query, metrics)

    def idfsearchGroup(self, idfconfig, metric_functions,
                       metric_functions_names):
        """Same as idfsearch but evaluates with grouped queries."""
        self.corpus = Corpus()
        idfformat = Config.code_config(idfconfig)
        Config.preprocessing_method_functions["tfidf"] = TFidf(
            self.train, idfformat)
        for k, qs in self.train.items():
            qs_q = []
            # print(qs)
            for qtext in qs:
                qformat = {}
                qformat[idfformat] = process_chain(qtext, idfformat)
                qs_q.append((qtext, qformat))
            self.corpus.add(
                QA([Question(qtext, qformat) for qtext, qformat in qs_q],
                   Answer(k)))
        metrics = []
        i = 0
        for mf in metric_functions:
            metrics.append(
                Metric(idfformat, mf, name=metric_functions_names[i]))
            i += 1
        return self._evaluate(self.corpus.queryGroup, metrics)

    def evaluate(self, metrics, decide=None):
        """Evaluate pre-built *metrics* with per-question queries.

        NOTE(review): *decide* is accepted but never forwarded — confirm
        whether _evaluate should receive it.
        """
        return self._evaluate(self.corpus.query, metrics)

    def evaluateGroup(self, metrics):
        """Evaluate pre-built *metrics* with grouped queries."""
        return self._evaluate(self.corpus.queryGroup, metrics)

    def _evaluate(self, query_func, metrics, decide=None):
        """Run every dev question through *query_func* and collect results.

        Returns:
            (mscores, answ): mscores maps each metric to the list of
            top-ranked answer numbers it produced (one per dev question, in
            iteration order); answ is the list of expected answer numbers in
            the same order.

        NOTE(review): *decide* and the local *c* are unused; ``max``/``min``
        here are dicts that shadow the builtins (the builtins are not used
        afterwards, so behavior is unaffected).
        """
        # Preprocess each dev question once per metric format.
        queries = []
        for ans_nr, qtext in self.dev.items():
            qformat = {}
            for m in metrics:
                qformat[m.format] = process_chain(qtext, m.format)
            queries.append((Query(question=Question(qtext, qformat),
                                  metrics=metrics,
                                  n=1), ans_nr))
        answ = []
        mscores = {}
        s = {}
        for m in metrics:
            mscores[m] = []
            s[m] = 0
        c = 0
        #print(len(queries))
        #print(len(self.corpus.qa_corpus))
        b = True  # first-iteration flag: max/min trackers are initialized
                  # lazily because the metric keys come from the result.
        for qq, ans in queries:
            rs = query_func(qq)
            #print(rs.rankings)
            if b:
                max = {}
                min = {}
                for m, hp in rs.rankings:
                    max[m] = 0
                    min[m] = 1
                b = False
            for m, hp in rs.rankings:
                # hp[0] is presumably the top-ranked hit — confirm upstream.
                mscores[m].append(hp[0].qa.ans.nr)
                if hp[0].qa.ans.nr == ans:
                    if hp[0].score <= 1:
                        s[m] += 1
                    #print(hp[0].score)
                    # Track the highest score seen among correct answers.
                    max[m] = max[m] if hp[0].score < max[m] else hp[0].score
                else:
                    # Track the lowest score seen among wrong answers.
                    min[m] = min[m] if hp[0].score > min[m] else hp[0].score
                    #print("############")
                    #print(hp[0].score)
                    #print("query:" + qq.question.text)
                    #print("processed" + str(qq.question.format[m.format]))
                    #print("obtained: " + hp[0].q.text)
                    #print("processed" + str(hp[0].q.format[m.format]))
            answ.append(ans)
        #for m in max.keys():
        #    print("max: " + str(max[m]) + " " + m.name)
        #    print("min: " + str(min[m]) + " " + m.name)
        #    print(s[m]/len(queries))
        return mscores, answ

    @staticmethod
    def code_config(p_dict):
        """Encode *p_dict* as a bit-string following preprocessing_methods order.

        A step contributes "1" only when present in *p_dict* with a truthy
        value; missing or falsy steps contribute "0".
        """
        code = ""
        for pp in Config.preprocessing_methods:
            if pp in p_dict:
                if p_dict[pp]:
                    code += "1"
                else:
                    code += "0"
            else:
                code += "0"
        return code

    @staticmethod
    def decode_config(code):
        """Inverse of code_config: return a dict with True for each "1" bit.

        Steps coded "0" are omitted entirely rather than set to False.
        """
        r_dict = {}
        for i in range(len(Config.preprocessing_methods)):
            if int(code[i]) == 1:
                r_dict[Config.preprocessing_methods[i]] = True
        return r_dict

    @staticmethod
    def loadCorpus(filename: str, configuration):
        """Build a runtime Corpus from *filename* for one configuration.

        Unlike the training path, each QA keeps the answer text (Answer gets
        both key and text). Side effect: rebinds the class-level "tfidf" hook
        to a runtime TFidf model over the extracted knowledge base.
        """
        corpus = Corpus()
        kb = extract(filename, validate=False)
        Config.preprocessing_method_functions["tfidf"] = TFidf(kb,
                                                               configuration,
                                                               runtime=True)
        for k, qa in kb.items():
            qs = qa[0]
            a = qa[1]
            questions = []
            for qtext in qs:
                qformat = {configuration: process_chain(qtext, configuration)}
                questions.append(Question(qtext, qformat))
            corpus.add(QA(questions, Answer(k, a)))
        return corpus