def run(self):
    """Run the QA pipeline: build one 4lang model per QA entry from its
    documents, then score each question's answers against that model and
    print the text of the best answer."""
    logging.info('running QA...')
    input_file = self.cfg.get('qa', 'input_file')
    for entry in QAParser.parse_file(input_file):
        logging.info('processing text...')
        # concatenate every document so a single model covers the entry
        docs_text = "\n".join(doc['text'] for doc in entry['docs'])
        model = self.text_to_4lang.process(
            docs_text, dep_dir=self.dep_dir, fn='text')
        print_text_graph(model, self.graph_dir)
        model_graph = MachineGraph.create_from_machines(model.values())
        for question in entry['questions']:
            best = self.answer_question(question, model, model_graph)
            print(best['text'])
def run(self):
    """Answer every question of every QA entry in the configured input
    file, printing the top-scoring answer's text for each question."""
    logging.info('running QA...')
    entries = QAParser.parse_file(self.cfg.get('qa', 'input_file'))
    for entry in entries:
        logging.info('processing text...')
        texts = [doc['text'] for doc in entry['docs']]
        # one model is built from the joined document texts
        model = self.text_to_4lang.process(
            "\n".join(texts), dep_dir=self.dep_dir, fn='text')
        print_text_graph(model, self.graph_dir)
        graph = MachineGraph.create_from_machines(model.values())
        for q in entry['questions']:
            print(self.answer_question(q, model, graph)['text'])
def process_deps(self, fn, name="none"):
    """Build 4lang machines from a dependency-parse JSON file.

    Each line of *fn* must be a JSON object with 'deps' (a list of
    per-sentence dependency lists) and 'corefs'. Graphs are printed
    under *name* when the 'text/print_graphs' config flag is set.

    Returns a list with one machines dict per sentence.

    Fixes vs. original: removed leftover debug print statements, no
    longer rebinds the 'fn' parameter with print_text_graph's return
    value, and dropped the dead sentence counter.
    """
    sen_machines = []
    for line in open(fn):
        data = json.loads(line)
        deps, corefs = data['deps'], data['corefs']
        for sen_deps in deps:
            machines = self.dep_to_4lang.get_machines_from_deps_and_corefs(
                [sen_deps], corefs)
            if self.expand:
                # NOTE(review): if both self.expand and the 'text/expand'
                # config flag are set, expansion runs twice -- confirm
                # this is intended.
                if self.abstract:
                    self.dep_to_4lang.lexicon_exp.expand(
                        machines, abstract=True)
                else:
                    self.dep_to_4lang.lexicon.expand(
                        machines, abstract=False)
            if self.cfg.getboolean('text', 'expand'):
                self.dep_to_4lang.lexicon.expand(machines)
            if self.cfg.getboolean('text', 'print_graphs'):
                print_text_graph(machines, self.graphs_dir, fn=name)
            sen_machines.append(machines)
    return sen_machines
def main_sen_sim(cfg):
    """Read tab-separated sentence pairs from stdin and print, for each
    pair, the similarity score of their 4lang graphs."""
    graph_dir = cfg.get("sim", "graph_dir")
    dep_dir = cfg.get("sim", "deps_dir")
    ensure_dir(graph_dir)
    ensure_dir(dep_dir)
    text_to_4lang = TextTo4lang(cfg)
    for index, raw_line in enumerate(sys.stdin):
        # lowercase both sentences before processing
        sen1, sen2 = raw_line.decode("utf-8").strip().lower().split("\t")
        name1 = "{0}a".format(index)
        name2 = "{0}b".format(index)
        machines1 = text_to_4lang.process(sen1, dep_dir=dep_dir, fn=name1)
        machines2 = text_to_4lang.process(sen2, dep_dir=dep_dir, fn=name2)
        print_text_graph(machines1, graph_dir, fn=name1)
        print_text_graph(machines2, graph_dir, fn=name2)
        graph1 = MachineGraph.create_from_machines(machines1.values())
        graph2 = MachineGraph.create_from_machines(machines2.values())
        print(GraphSimilarity.graph_similarity(graph1, graph2))
def answer_question(self, question, model, model_graph):
    """Score every candidate answer of *question* against the text model
    and return the highest-scoring one.

    Side effects: stores 'machines' on the question and on each answer,
    prints each answer's graph, and lets score_answer set 'score' and
    'evidence' on each answer.
    """
    logging.info('processing question: {0}...'.format(question['text']))
    q_name = "q{0}".format(question['id'])
    question['machines'] = self.text_to_4lang.process(
        question['text'], dep_dir=self.dep_dir, fn=q_name)
    for answer in question['answers']:
        logging.info('processing answer: {0}...'.format(answer['text']))
        a_name = "q{0}a{1}".format(question['id'], answer['id'])
        answer['machines'] = self.text_to_4lang.process(
            answer['text'], dep_dir=self.dep_dir, fn=a_name)
        print_text_graph(answer['machines'], self.graph_dir, fn=a_name)
        self.score_answer(answer, model, model_graph)
        logging.info('score: {0}, evidence: {1}'.format(
            answer['score'], answer['evidence']))
    # max() returns the first answer with the top score, exactly like
    # sorting by negated score and taking element 0 (both are stable)
    return max(question['answers'], key=lambda a: a['score'])
def main_sen_sim(cfg):
    """For each tab-separated sentence pair on stdin, build both 4lang
    graphs, dump them to the graph directory, and print their similarity."""
    graph_dir = cfg.get("sim", "graph_dir")
    dep_dir = cfg.get("sim", "deps_dir")
    for directory in (graph_dir, dep_dir):
        ensure_dir(directory)
    text_to_4lang = TextTo4lang(cfg)
    for i, line in enumerate(sys.stdin):
        sen1, sen2 = line.decode('utf-8').strip().lower().split('\t')
        graphs = []
        # the two sentences of pair i are named "<i>a" and "<i>b"
        for sen, suffix in ((sen1, 'a'), (sen2, 'b')):
            name = "{0}{1}".format(i, suffix)
            machines = text_to_4lang.process(sen, dep_dir=dep_dir, fn=name)
            print_text_graph(machines, graph_dir, fn=name)
            graphs.append(
                MachineGraph.create_from_machines(machines.values()))
        print(GraphSimilarity.graph_similarity(graphs[0], graphs[1]))
def answer_question(self, question, model, model_graph):
    """Process one question: build machines for the question and each of
    its answers, score the answers against *model*/*model_graph*, and
    return the best-scoring answer dict."""
    q_id = question['id']
    logging.info('processing question: {0}...'.format(question['text']))
    question['machines'] = self.text_to_4lang.process(
        question['text'], dep_dir=self.dep_dir, fn="q{0}".format(q_id))
    for answer in question['answers']:
        logging.info('processing answer: {0}...'.format(answer['text']))
        graph_name = "q{0}a{1}".format(q_id, answer['id'])
        answer['machines'] = self.text_to_4lang.process(
            answer['text'], dep_dir=self.dep_dir, fn=graph_name)
        print_text_graph(answer['machines'], self.graph_dir, fn=graph_name)
        self.score_answer(answer, model, model_graph)
        logging.info('score: {0}, evidence: {1}'.format(
            answer['score'], answer['evidence']))
    # stable descending sort: ties keep input order, so the first answer
    # with the top score wins, same as the negated-key variant
    ranked = sorted(
        question['answers'], key=lambda a: a['score'], reverse=True)
    return ranked[0]
def process_phrase(self, phrase):
    """Build 4lang machines for a single phrase.

    Parses *phrase*, converts the dependencies of its FIRST sentence
    only to machines, optionally expands them via the lexicon, and
    optionally prints the graph (filename derived from the phrase).

    Returns the machines produced by dep_to_4lang.

    Fixes vs. original: dropped the unused return value of
    print_text_graph, the pointless single-element list + join, and
    the filename computation that ran even when no graph is printed.
    """
    preproc_sen = TextTo4lang.preprocess_text(
        phrase.strip().decode('utf-8'))
    deps, corefs, _ = self.parser_wrapper.parse_text(preproc_sen)
    machine = self.dep_to_4lang.get_machines_from_deps_and_corefs(
        [deps[0]], corefs)
    if self.cfg.getboolean('text', 'expand'):
        self.dep_to_4lang.lexicon.expand(machine)
    if self.cfg.getboolean('text', 'print_graphs'):
        # graph file named after the phrase: spaces -> '_', dots removed
        file_name = phrase.replace(' ', '_').replace('.', '')
        print_text_graph(machine, self.graphs_dir, fn=file_name)
    return machine
def process_phrase(self, phrase):
    """Parse *phrase* and return the 4lang machines built from the
    dependencies of its first sentence, expanding and printing the
    graph according to the 'text' config section."""
    sens = [TextTo4lang.preprocess_text(phrase.strip().decode('utf-8'))]
    deps, corefs, _ = self.parser_wrapper.parse_text("\n".join(sens))
    # only the first sentence's dependencies are used
    machine = self.dep_to_4lang.get_machines_from_deps_and_corefs(
        [deps[0]], corefs)
    if self.cfg.getboolean('text', 'expand'):
        self.dep_to_4lang.lexicon.expand(machine)
    file_name = phrase.replace(' ', '_').replace('.', '')
    if self.cfg.getboolean('text', 'print_graphs'):
        print_text_graph(machine, self.graphs_dir, fn=file_name)
    return machine
def main():
    """Script entry point: read sentences from the configured input
    file (optionally capped by argv[2]) and write one 4lang graph."""
    log_format = ("%(asctime)s : %(module)s (%(lineno)s) - "
                  "%(levelname)s - %(message)s")
    logging.basicConfig(level=__LOGLEVEL__, format=log_format)
    cfg_file = None
    max_sens = None
    if len(sys.argv) > 1:
        cfg_file = sys.argv[1]
    if len(sys.argv) > 2:
        max_sens = int(sys.argv[2])
    cfg = get_cfg(cfg_file)
    text_to_4lang = TextTo4lang(cfg)
    sens = []
    for line in open(cfg.get('data', 'input_sens')):
        sens.append(line.strip())
    if max_sens is not None:
        del sens[max_sens:]
    words_to_machines = text_to_4lang.process(sens, print_deps=True)
    graph_fn = print_text_graph(
        words_to_machines, cfg.get('machine', 'graph_dir'))
    logging.info('wrote graph to {0}'.format(graph_fn))
def main():
    """Entry point: build a 4lang graph from the sentences listed in
    the 'data/input_sens' file; argv[1] selects the config file and
    argv[2] caps the number of sentences."""
    logging.basicConfig(
        level=__LOGLEVEL__,
        format=("%(asctime)s : %(module)s (%(lineno)s) - "
                "%(levelname)s - %(message)s"))
    argv = sys.argv
    cfg_file = argv[1] if len(argv) > 1 else None
    max_sens = int(argv[2]) if len(argv) > 2 else None
    cfg = get_cfg(cfg_file)
    converter = TextTo4lang(cfg)
    input_fn = cfg.get('data', 'input_sens')
    sens = [sen_line.strip() for sen_line in open(input_fn)]
    if max_sens is not None:
        sens = sens[:max_sens]
    machines = converter.process(sens, print_deps=True)
    out_fn = print_text_graph(machines, cfg.get('machine', 'graph_dir'))
    logging.info('wrote graph to {0}'.format(out_fn))
def process_deps(self, fn):
    """Build 4lang machines from a dependency-parse JSON file.

    Each line of *fn* must be a JSON object with 'deps' (a list of
    per-sentence dependency lists) and 'corefs'. Graph files are named
    by a running sentence counter.

    Returns a list with one machines dict per sentence.

    Fix vs. original: the loop used to rebind the 'fn' parameter with
    print_text_graph's return value, shadowing the input filename.
    """
    sen_machines = []
    sen_count = 0
    for line in open(fn):
        data = json.loads(line)
        deps, corefs = data['deps'], data['corefs']
        for sen_deps in deps:
            machines = self.dep_to_4lang.get_machines_from_deps_and_corefs(
                [sen_deps], corefs)
            if self.cfg.getboolean('text', 'expand'):
                self.dep_to_4lang.lexicon.expand(machines)
            if self.cfg.getboolean('text', 'print_graphs'):
                print_text_graph(machines, self.graphs_dir, fn=sen_count)
            sen_machines.append(machines)
            sen_count += 1
    return sen_machines