# Method of the Wrapper class defined below; relies on self.lemmatizer and
# self._add_dependency, which are set up elsewhere in the class.
    def get_machines_from_deps_and_corefs(self, dep_lists, corefs):
        # Map each mention word back to its canonical word, indexed by the
        # (0-based) sentence number of the mention.
        coref_index = defaultdict(dict)
        for (word, sen_no), mentions in corefs:
            for m_word, m_sen_no in mentions:
                coref_index[m_word][m_sen_no - 1] = word

        lexicon = Lexicon()
        word2machine = {}
        for i, deps in enumerate(dep_lists):
            try:
                for dep, (word1, id1), (word2, id2) in deps:
                    # unify coreferent mentions with their canonical words
                    c_word1 = coref_index[word1].get(i, word1)
                    c_word2 = coref_index[word2].get(i, word2)

                    lemma1 = self.lemmatizer.lemmatize(c_word1)
                    lemma2 = self.lemmatizer.lemmatize(c_word2)
                    lemma1 = c_word1 if not lemma1 else lemma1
                    lemma2 = c_word2 if not lemma2 else lemma2
                    # TODO: slashes would break dot output, replace them
                    lemma1 = lemma1.replace('/', '_PER_')
                    lemma2 = lemma2.replace('/', '_PER_')

                    machine1, machine2 = self._add_dependency(
                        dep, (lemma1, id1), (lemma2, id2), lexicon)
                    word2machine[lemma1] = machine1
                    word2machine[lemma2] = machine2
            except Exception:
                logging.error("failure on dep: {0}({1}, {2})".format(
                    dep, word1, word2))
                traceback.print_exc()
                raise Exception("adding dependencies failed")
        return word2machine
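# A hedged sketch (not in the original source) of the input shapes the method
# above expects, inferred from its loops: dep_lists holds one list of
# (relation, (word, id), (word, id)) triples per sentence (0-indexed), and
# corefs pairs each canonical (word, sentence_no) with its mentions, using
# 1-based sentence numbers.
def _demo_deps_and_corefs(wrapper):
    # wrapper is assumed to be a fully initialised Wrapper instance
    dep_lists = [
        [('nsubj', ('chased', 2), ('dog', 1)),   # sentence 0
         ('dobj', ('chased', 2), ('cat', 4))],
        [('nsubj', ('barked', 2), ('it', 1))],   # sentence 1
    ]
    # 'it' in sentence 2 (1-based) is a mention of 'dog' from sentence 1,
    # so the method will replace it with the canonical word 'dog'
    corefs = [(('dog', 1), [('it', 2)])]
    return wrapper.get_machines_from_deps_and_corefs(dep_lists, corefs)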
class Wrapper:
    num_re = re.compile(r'^[0-9.,]+$', re.UNICODE)

    def __init__(self, cfg, batch=False, include_ext=True):
        self.cfg = cfg
        self.__read_config()
        self.batch = batch
        self.wordlist = set()
        self.__read_definitions()
        if include_ext:
            self.get_ext_definitions()
        self.__read_supp_dict()
        self.reset_lexicon()

    def reset_lexicon(self, load_from=None, save_to=None):
        if load_from:
            self.lexicon = cPickle.load(open(load_from))
        else:
            self.lexicon = Lexicon()
            self.__add_definitions()
            self.__add_constructions()
        if save_to:
            cPickle.dump(self.lexicon, open(save_to, 'w'))

    def __read_config(self):
        items = dict(self.cfg.items("machine"))
        self.def_files = [
            (s.split(":")[0].strip(), int(s.split(":")[1]))
            for s in items["definitions"].split(",")]
        self.dep_map_fn = items.get("dep_map")
        self.tok2lemma_fn = items.get("tok2lemma")
        self.ext_defs_path = items.get("ext_definitions")
        self.supp_dict_fn = items.get("supp_dict")
        self.plural_fn = items.get("plurals")

    def __read_definitions(self):
        self.definitions = {}
        for file_name, printname_index in self.def_files:
            # TODO HACK: these files should be built by a makefile
            if (file_name.endswith("generated") and
                    not os.path.exists(file_name)):
                raise Exception(
                    "A definition file that should be generated by " +
                    "pymachine/scripts/generate_translation_dict.sh " +
                    "does not exist: {0}".format(file_name))

            if file_name.endswith('pickle'):
                logging.info(
                    'loading 4lang definitions from {}...'.format(file_name))
                definitions = cPickle.load(open(file_name))
            else:
                logging.info('parsing 4lang definitions...')
                definitions = read_defs(
                    open(file_name), self.plural_fn, printname_index,
                    three_parts=True)

                logging.info('dumping 4lang definitions to file...')
                with open('{0}.pickle'.format(file_name), 'w') as f:
                    cPickle.dump(definitions, f)

            for pn, machines in definitions.iteritems():
                if pn not in self.definitions:
                    self.definitions[pn] = machines
                else:
                    self.definitions[pn] |= machines

    def __add_definitions(self):
        definitions = deepcopy(self.definitions)
        self.lexicon.add_static(definitions.itervalues())
        self.lexicon.finalize_static()

    def __read_supp_dict(self):
        self.supp_dict = sdreader(
            open(self.supp_dict_fn)) if self.supp_dict_fn else {}

    def __add_constructions(self):
        for construction in np_grammar.np_rules:
            self.lexicon.add_construction(construction)
        # add_verb_constructions(self.lexicon, self.supp_dict)
        # add_avm_constructions(self.lexicon, self.supp_dict)

    def get_ext_definitions(self):
        if self.ext_defs_path.endswith('pickle'):
            logging.info(
                'loading external definitions from {}...'.format(
                    self.ext_defs_path))
            definitions = cPickle.load(open(self.ext_defs_path))
        else:
            raise Exception("building machines from deps has moved to 4lang")

        for word, machine in definitions.iteritems():
            if word not in self.definitions:
                self.definitions[word] = set([machine])
        logging.info('done')

    def draw_single_graph(self, word, path):
        clean_word = Machine.d_clean(word)
        for c, machine in enumerate(self.definitions[word]):
            graph = MachineGraph.create_from_machines([machine])
            file_name = os.path.join(
                path, '{0}_{1}.dot'.format(clean_word, c))
            with open(file_name, 'w') as file_obj:
                file_obj.write(graph.to_dot().encode('utf-8'))

    def draw_word_graphs(self):
        ensure_dir('graphs/words')
        for c, (word, machines) in enumerate(self.definitions.iteritems()):
            if c % 1000 == 0:
                logging.info("{0}...".format(c))
            for i, machine in enumerate(machines):
                graph = MachineGraph.create_from_machines([machine])
                clean_word = Machine.d_clean(word)
                if clean_word[0] == 'X':
                    clean_word = clean_word[1:]
                file_name = 'graphs/words/{0}_{1}.dot'.format(clean_word, i)
                with open(file_name, 'w') as f:
                    f.write(graph.to_dot().encode('utf-8'))

    def get_def_words(self, stream):
        for headword, machines in self.definitions.iteritems():
            if headword[0] == '@':
                continue
            for machine in machines:
                def_words = [
                    word for word in MachineTraverser.get_nodes(machine)
                    if word[0] not in '=@']
                stream.write(u"{0}\t{1}\n".format(
                    headword, u"\t".join(def_words)).encode("utf-8"))

    def run(self, sentence):
        """Parses a sentence, runs the spreading activation and returns
        the messages that have to be sent to the active plugins."""
        try:
            sp = SentenceParser()
            sa = SpreadingActivation(self.lexicon)
            machines = sp.parse(sentence)
            logging.debug('machines: {}'.format(machines))
            for machine_list in machines:
                for machine in machine_list:
                    if machine.control.kr['CAT'] == 'VERB':
                        logging.debug(
                            'adding verb construction for {}'.format(
                                machine))
                        self.lexicon.add_construction(VerbConstruction(
                            machine.printname(), self.lexicon,
                            self.supp_dict))
            logging.info('constructions: {}'.format(
                self.lexicon.constructions))

            # results is a list of (url, data) tuples
            results = sa.activation_loop(machines)
            print 'results:', results
            print 'machines:', machines

            graph = MachineGraph.create_from_machines(
                [m[0] for m in machines], max_depth=1)
            with open('machines.dot', 'w') as f:
                f.write(graph.to_dot().encode('utf-8'))

            self.lexicon.clear_active()
        except Exception:
            traceback.print_exc()
            raise
        return results
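# A minimal usage sketch (an assumption, not part of the original module):
# the config is a ConfigParser instance with a [machine] section whose
# "definitions" value lists "file_name:printname_index" pairs; the file name
# 'machine.cfg' below is illustrative only.
if __name__ == '__main__':
    import sys
    import ConfigParser
    logging.basicConfig(level=logging.INFO)
    cfg = ConfigParser.ConfigParser()
    cfg.read('machine.cfg')
    wrapper = Wrapper(cfg, include_ext=False)
    # draw dot graphs for every definition, then dump headwords + def words
    wrapper.draw_word_graphs()
    wrapper.get_def_words(sys.stdout)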