def process_entries(self, words):
    # Preprocess each raw entry, parse its definitions with the
    # language-appropriate parser, then index the parsed entries by
    # headword, unifying entries whose headwords collide.
    entry_preprocessor = EntryPreprocessor(self.cfg)
    entries = map(entry_preprocessor.preprocess_entry,
                  (self.raw_dict[word] for word in words))

    if self.lang == 'eng':
        stanford_wrapper = StanfordWrapper(self.cfg)
        entries = stanford_wrapper.parse_sentences(
            entries, definitions=True)
    elif self.lang == 'hun':
        magyarlanc_wrapper = Magyarlanc(self.cfg)
        entries = magyarlanc_wrapper.parse_entries(entries)
    else:
        print 'incorrect lang'

    for entry in entries:
        if entry['to_filter']:
            continue
        word = entry['hw']
        for sense in entry['senses']:
            definition = sense['definition']
            if definition is None:
                continue

        if word in self.dictionary:
            logging.warning(
                "entries with identical headwords:\n{0}\n{1}".format(
                    entry, self.dictionary[word]))
            self.unify(self.dictionary[word], entry)
        else:
            self.dictionary[word] = entry
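# Hypothetical usage sketch, not part of the original module: process_entries
# above is assumed to be a method of the dictionary-processing class that owns
# self.cfg, self.raw_dict, self.dictionary and self.unify; the class name and
# config path below are illustrative only.
#
#     dict_builder = DictTo4lang(cfg)       # cfg: ConfigParser with a [deps] "lang" option
#     dict_builder.process_entries(['dog', 'cat'])
#     entry = dict_builder.dictionary.get('dog')   # parsed entry keyed by headword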
def __init__(self, cfg):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    self.deps_dir = self.cfg.get('text', 'deps_dir')
    # self.machines_dir = self.cfg.get('text', 'machines_dir')
    self.graphs_dir = cfg.get('text', 'graph_dir')
    map(ensure_dir, (self.deps_dir, self.graphs_dir))  # self.machines_dir
    if self.lang == 'en':
        self.parser_wrapper = CoreNLPWrapper(self.cfg)
    elif self.lang == 'hu':
        self.parser_wrapper = Magyarlanc(self.cfg)
    self.dep_to_4lang = DepTo4lang(self.cfg)
def __init__(self, cfg, direct_parse=False):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    if not direct_parse:
        self.deps_dir = self.cfg.get('text', 'deps_dir')
        ensure_dir(self.deps_dir)
        # self.machines_dir = self.cfg.get('text', 'machines_dir')
        self.graphs_dir = cfg.get('text', 'graph_dir')
        ensure_dir(self.graphs_dir)
    if self.lang == 'en':
        self.parser_wrapper = CoreNLPWrapper(self.cfg)
    elif self.lang == 'hu':
        self.parser_wrapper = Magyarlanc(self.cfg)
    self.dep_to_4lang = DepTo4lang(self.cfg, direct_parse)
class TextTo4lang():
    square_regex = re.compile("\[.*?\]")

    def __init__(self, cfg):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        self.deps_dir = self.cfg.get('text', 'deps_dir')
        # self.machines_dir = self.cfg.get('text', 'machines_dir')
        self.graphs_dir = cfg.get('text', 'graph_dir')
        map(ensure_dir, (self.deps_dir, self.graphs_dir))  # self.machines_dir
        if self.lang == 'en':
            self.parser_wrapper = CoreNLPWrapper(self.cfg)
        elif self.lang == 'hu':
            self.parser_wrapper = Magyarlanc(self.cfg)
        self.dep_to_4lang = DepTo4lang(self.cfg)

    @staticmethod
    def preprocess_text(text):
        t = text.strip()
        t = TextTo4lang.square_regex.sub('', t)
        t = t.replace(u"=", u"_eq_")
        t = t.replace(u"\xc2\xa0", u" ")
        t = t.replace(u"\xa0", u" ")
        t = t.strip()
        # if t != text:
        #     logging.debug(u"{0} -> {1}".format(text, t))
        return t

    def print_deps(self, parsed_sens, dep_dir=None, fn=None):
        for i, deps in enumerate(parsed_sens):
            if fn is None:
                out_fn = os.path.join(dep_dir, "{0}.dep".format(i))
            else:
                out_fn = os.path.join(dep_dir, "{0}_{1}.dep".format(fn, i))
            with open(out_fn, 'w') as f:
                f.write(
                    "\n".join(["{0}({1}, {2})".format(*dep) for dep in deps]))

    def process(self):
        input_path = self.cfg.get('text', 'input_sens')
        if os.path.isdir(input_path):
            file_names = [
                os.path.join(input_path, fn) for fn in os.listdir(input_path)]
        else:
            file_names = [input_path]
        logging.info('will process {0} file(s)'.format(len(file_names)))
        map(self.process_file, file_names)

    def process_file(self, fn):
        base_fn = os.path.basename(fn)
        deps_fn = os.path.join(self.deps_dir, "{0}.deps".format(base_fn))
        # machines_fn = os.path.join(
        #     self.machines_dir, "{0}.machines".format(base_fn))
        if not os.path.exists(deps_fn):
            self.parse_file(fn, deps_fn)
        else:
            logging.info('file exists: {0}, not parsing'.format(deps_fn))
        # TODO also support dumping machines to file
        # logging.getLogger().setLevel(__MACHINE_LOGLEVEL__)
        if not self.cfg.getboolean('text', 'parse_only'):
            self.process_deps(deps_fn)

    def parse_file(self, fn, out_fn):
        logging.info("parsing file: {0}".format(fn))
        preproc_sens = []
        for line in open(fn):
            if not line:
                continue
            preproc_sens.append(TextTo4lang.preprocess_text(
                line.strip().decode('utf-8')))
        deps, corefs = self.parser_wrapper.parse_text("\n".join(preproc_sens))
        with open(out_fn, 'w') as out_f:
            out_f.write("{0}\n".format(json.dumps({
                "deps": deps, "corefs": corefs})))
        logging.info("parsed {0} sentences".format(len(deps)))

    def process_deps(self, fn):
        sen_machines = []
        c = 0
        for line in open(fn):
            data = json.loads(line)
            deps, corefs = data['deps'], data['corefs']
            for sen_deps in deps:
                # logging.info("processing sentences...")
                machines = \
                    self.dep_to_4lang.get_machines_from_deps_and_corefs(
                        [sen_deps], corefs)
                if self.cfg.getboolean('text', 'expand'):
                    self.dep_to_4lang.lexicon.expand(machines)
                if self.cfg.getboolean('text', 'print_graphs'):
                    fn = print_text_graph(machines, self.graphs_dir, fn=c)
                sen_machines.append(machines)
                c += 1
        return sen_machines

    @staticmethod
    def delete_connection(m1, m2):
        for part in range(len(m1.partitions)):
            if m2 in m1.partitions[part]:
                m1.remove(m2, part)
                return part
        # ipdb.set_trace()
        return None
class TextTo4lang():
    square_regex = re.compile("\[.*?\]")

    def __init__(self, cfg, direct_parse=False):
        self.cfg = cfg
        self.lang = self.cfg.get("deps", "lang")
        if not direct_parse:
            self.deps_dir = self.cfg.get('text', 'deps_dir')
            ensure_dir(self.deps_dir)
            # self.machines_dir = self.cfg.get('text', 'machines_dir')
            self.graphs_dir = cfg.get('text', 'graph_dir')
            ensure_dir(self.graphs_dir)
        if self.lang == 'en':
            self.parser_wrapper = CoreNLPWrapper(self.cfg)
        elif self.lang == 'hu':
            self.parser_wrapper = Magyarlanc(self.cfg)
        self.dep_to_4lang = DepTo4lang(self.cfg, direct_parse)

    @staticmethod
    def preprocess_text(text):
        t = text.strip()
        t = TextTo4lang.square_regex.sub('', t)
        t = t.replace(u"=", u"_eq_")
        t = t.replace(u"\xc2\xa0", u" ")
        t = t.replace(u"\xa0", u" ")
        t = t.strip()
        # if t != text:
        #     logging.debug(u"{0} -> {1}".format(text, t))
        return t

    def print_deps(self, parsed_sens, dep_dir=None, fn=None):
        for i, deps in enumerate(parsed_sens):
            if fn is None:
                out_fn = os.path.join(dep_dir, "{0}.dep".format(i))
            else:
                out_fn = os.path.join(dep_dir, "{0}_{1}.dep".format(fn, i))
            with open(out_fn, 'w') as f:
                f.write("\n".join(
                    ["{0}({1}, {2})".format(*dep) for dep in deps]))

    def process(self):
        input_path = self.cfg.get('text', 'input_sens')
        if os.path.isdir(input_path):
            file_names = [
                os.path.join(input_path, fn)
                for fn in os.listdir(input_path)]
        else:
            file_names = [input_path]
        logging.info('will process {0} file(s)'.format(len(file_names)))
        map(self.process_file, file_names)

    def process_file(self, fn):
        base_fn = os.path.basename(fn)
        deps_fn = os.path.join(self.deps_dir, "{0}.deps".format(base_fn))
        # machines_fn = os.path.join(
        #     self.machines_dir, "{0}.machines".format(base_fn))
        if not os.path.exists(deps_fn):
            self.parse_file(fn, deps_fn)
        else:
            logging.info('file exists: {0}, not parsing'.format(deps_fn))
        # TODO also support dumping machines to file
        # logging.getLogger().setLevel(__MACHINE_LOGLEVEL__)
        if not self.cfg.getboolean('text', 'parse_only'):
            self.process_deps(deps_fn)

    def parse_file(self, fn, out_fn):
        logging.info("parsing file: {0}".format(fn))
        preproc_sens = []
        for line in open(fn):
            if not line:
                continue
            preproc_sens.append(
                TextTo4lang.preprocess_text(line.strip().decode('utf-8')))
        deps, corefs, parse_trees = self.parser_wrapper.parse_text(
            "\n".join(preproc_sens))
        # also dump one parse tree per sentence alongside the .deps file
        parse_tree_fn = out_fn.split('.')[0] + '_parse_trees.txt'
        with open(parse_tree_fn, 'w') as out_f:
            for sen, parse_tree in zip(preproc_sens, parse_trees):
                out_f.write(
                    u"{0}\t{1}\n".format(sen, parse_tree).encode('utf-8'))
        with open(out_fn, 'w') as out_f:
            out_f.write("{0}\n".format(
                json.dumps({"deps": deps, "corefs": corefs})))
        logging.info("parsed {0} sentences".format(len(deps)))

    def process_deps(self, fn):
        sen_machines = []
        c = 0
        for line in open(fn):
            data = json.loads(line)
            deps, corefs = data['deps'], data['corefs']
            for sen_deps in deps:
                # logging.info("processing sentences...")
                machines = \
                    self.dep_to_4lang.get_machines_from_deps_and_corefs(
                        [sen_deps], corefs)
                if self.cfg.getboolean('text', 'expand'):
                    self.dep_to_4lang.lexicon.expand(machines)
                if self.cfg.getboolean('text', 'print_graphs'):
                    fn = print_text_graph(machines, self.graphs_dir, fn=c)
                sen_machines.append(machines)
                c += 1
        return sen_machines

    @staticmethod
    def delete_connection(m1, m2):
        for part in range(len(m1.partitions)):
            if m2 in m1.partitions[part]:
                m1.remove(m2, part)
                return part
        # ipdb.set_trace()
        return None

    def process_phrase(self, phrase):
        # parse a single phrase in memory and return its 4lang machines
        preproc_sens = []
        preproc_sens.append(
            TextTo4lang.preprocess_text(phrase.strip().decode('utf-8')))
        deps, corefs, _ = self.parser_wrapper.parse_text(
            "\n".join(preproc_sens))
        machine = self.dep_to_4lang.get_machines_from_deps_and_corefs(
            [deps[0]], corefs)
        if self.cfg.getboolean('text', 'expand'):
            self.dep_to_4lang.lexicon.expand(machine)
        file_name = phrase.replace(' ', '_')
        file_name = file_name.replace('.', '')
        if self.cfg.getboolean('text', 'print_graphs'):
            fn = print_text_graph(machine, self.graphs_dir, fn=file_name)
        return machine
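# Hypothetical usage sketch, not part of the original module: assumes a
# ConfigParser-style cfg providing the options read above ([deps] lang and
# [text] deps_dir, graph_dir, input_sens, expand, print_graphs, parse_only);
# the config path below is illustrative.
#
#     cfg = ConfigParser()
#     cfg.read('conf/default.cfg')
#     text_to_4lang = TextTo4lang(cfg)
#     text_to_4lang.process()            # parse input_sens, write .deps and graphs
#
#     # in-memory use on a single phrase; direct_parse skips the output
#     # directories, so this assumes print_graphs is turned off
#     direct = TextTo4lang(cfg, direct_parse=True)
#     machine = direct.process_phrase('brown dog')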