Beispiel #1
0
    def to_dot_str_graph(self):
        """Render ``self.G`` as the source text of a DOT ``digraph``.

        Every node of the graph yields one ``circle`` statement and every
        edge one ``->`` statement labelled with the edge's ``color``
        attribute. Node statements and edge statements are each sorted
        before they are emitted.

        Returns:
            The DOT source as one newline-joined string.
        """
        node_template = u'\t{0} [shape = circle, label = "{1}"];'
        edge_template = u'\t{0} -> {1} [ label = "{2}" ];'

        def dot_id(raw_name):
            # identifiers are the cleaned name with '=' turned into '_'
            return Machine.d_clean(raw_name).replace('=', '_')

        def node_statement(node):
            ident = dot_id(node)
            # the label is the second-to-last '_'-separated field of the
            # identifier, or the whole identifier when it contains no '_'
            label = ident.split('_')[-2] if '_' in ident else ident
            return node_template.format(ident, label).replace('-', '_')

        def edge_statement(source, target, attrs):
            return edge_template.format(
                dot_id(source), dot_id(target), attrs['color'])

        dot = [u'digraph finite_state_machine {', '\tdpi=100;']
        dot.extend(sorted(node_statement(node) for node in self.G.nodes()))
        dot.extend(sorted(
            edge_statement(u, v, edata)
            for u, v, edata in self.G.edges(data=True)))
        dot.append('}')
        return u'\n'.join(dot)
Beispiel #2
0
 def parse(self, sentence):
     """Turn a tagged sentence into lists of machines.

     The input sentence is a list of tokens and chunks in this format:
         [(token1_tag1, token1_tag2, token1_tagX, ...),
          ([(tokeninchunk1_tag1, tokeninchunk1_tagX,...),
            ...
           ], case_of_chunk),
          (token_out_of_chunk_again_tagX,...),
          ...
         ]
     An item whose first element is a list is handled as a chunk, any
     other item as a single token. Tokens (inside or outside a chunk) are
     unpacked as two-element ``(_, analysis)`` pairs; the machine is named
     after the part of ``analysis`` before the first "/" and is controlled
     by ``Control(analysis)``.

     Returns a list with one entry per input item. Each entry is itself a
     list: one machine for a token, one machine per token for a chunk.
     """
     # imported here so that the method does not rely on a module-level
     # import that may be missing
     import logging

     def build(analysis):
         return Machine(analysis.split("/")[0], Control(analysis))

     machines = []
     for token_or_chunk in sentence:
         if isinstance(token_or_chunk[0], list):
             # chunk: the case of the chunk is not used here
             chunk, _ = token_or_chunk
             machines.append([build(analysis) for _, analysis in chunk])
         else:
             # token: diagnostics go to the log instead of stdout
             logging.debug('token: %s', token_or_chunk)
             _, analysis = token_or_chunk
             machines.append([build(analysis)])
     return machines
Beispiel #3
0
    def to_dot_str_graph(self):
        """Render ``self.G`` as the source text of a DOT ``digraph``.

        Every node of the graph yields one ``circle`` statement and every
        edge one ``->`` statement labelled with the edge's ``color``
        attribute. Node statements and edge statements are each sorted
        before they are emitted.

        Returns:
            The DOT source as one newline-joined string.
        """
        node_template = u'\t{0} [shape = circle, label = "{1}"];'
        edge_template = u'\t{0} -> {1} [ label = "{2}" ];'

        def dot_id(raw_name):
            # identifiers are the cleaned name with '=' turned into '_'
            return Machine.d_clean(raw_name).replace('=', '_')

        def node_statement(node):
            ident = dot_id(node)
            # the label is the second-to-last '_'-separated field of the
            # identifier, or the whole identifier when it contains no '_'
            label = ident.split('_')[-2] if '_' in ident else ident
            return node_template.format(ident, label).replace('-', '_')

        def edge_statement(source, target, attrs):
            return edge_template.format(
                dot_id(source), dot_id(target), attrs['color'])

        dot = [u'digraph finite_state_machine {', '\tdpi=100;']
        dot.extend(sorted(node_statement(node) for node in self.G.nodes()))
        dot.extend(sorted(
            edge_statement(u, v, edata)
            for u, v, edata in self.G.edges(data=True)))
        dot.append('}')
        return u'\n'.join(dot)
Beispiel #4
0
 def act(self, seq):
     """Combine two machines of ``seq`` under a new relation machine.

     A machine named ``self.bin_rel`` is created. The machine at
     ``seq[self.first_pos]`` is appended to it with index 1 and the
     machine at ``seq[self.second_pos]`` with index 2.

     Returns a one-element list holding the new relation machine.
     """
     relation = Machine(self.bin_rel)
     for index, position in enumerate(
             (self.first_pos, self.second_pos), 1):
         relation.append(seq[position], index)
     return [relation]
Beispiel #5
0
    def get_machine(self,
                    printname,
                    new_machine=False,
                    allow_new_base=False,
                    allow_new_ext=False,
                    allow_new_oov=True):
        """Return the machine stored for ``printname``, creating it if needed.

        The lookup prefers ``self.lexicon``, then ``self.ext_lexicon``, then
        ``self.oov_lexicon`` (base < ext < oov). When no machine exists, a
        new one is added at the lowest level permitted by the ``allow_*``
        flags and the lookup is repeated.

        Special cases:
          * ``new_machine=True`` returns a fresh machine without adding it
            to any lexicon;
          * an empty printname is looked up as ``"_empty_"``;
          * an all-uppercase printname is looked up in its lowercase form.

        Returns the machine, or None when none exists and no ``allow_*``
        flag permits creating one.

        Raises Exception when more than one machine is stored for the
        printname.
        """
        if new_machine:
            # throw-away machine: no lexicon is touched
            return Machine(printname, ConceptControl())

        # TODO
        if not printname:
            return self.get_machine("_empty_")

        if printname.isupper():
            return self.get_machine(printname=printname.lower(),
                                    new_machine=new_machine,
                                    allow_new_base=allow_new_base,
                                    allow_new_ext=allow_new_ext,
                                    allow_new_oov=allow_new_oov)

        # each level is the fallback of the level below it
        oov_entry = self.oov_lexicon.get(printname, set())
        ext_entry = self.ext_lexicon.get(printname, oov_entry)
        candidates = self.lexicon.get(printname, ext_entry)

        if len(candidates) > 1:
            lexicons = (self.lexicon, self.ext_lexicon, self.oov_lexicon)
            per_lexicon = [lex.get(printname, set([])) for lex in lexicons]
            raise Exception(
                u'ambiguous printname: {0}, machines: {1}'.format(
                    printname, per_lexicon))

        if len(candidates) == 1:
            return next(iter(candidates))

        # unknown word: register a new machine where the flags allow it
        created = Machine(printname, ConceptControl())
        if allow_new_base:
            self.add(printname, created, external=False)
        elif allow_new_ext:
            self.add(printname, created)
        elif allow_new_oov:
            self.add(printname, created, oov=True)
        else:
            return None

        return self.get_machine(printname)
Beispiel #6
0
    def create_from_dumps(machines_dump, ext_machines_dump, primitives, cfg):
        """Rebuild a lexicon from dumps created by Lexicon.dump_machines.

        A new ``Lexicon(cfg)`` gets ``primitives`` assigned. For every
        ``word -> dumped definition graph`` pair of both dumps a new
        machine is created, the definition graph is attached with
        ``add_def_graph`` and the machine is added to the lexicon. Entries
        of ``machines_dump`` are added with ``external=False``, entries of
        ``ext_machines_dump`` with ``external=True``.

        Returns the populated lexicon.
        """
        lexicon = Lexicon(cfg)
        lexicon.primitives = primitives

        # (dump, value of the ``external`` flag), in processing order
        sources = ((machines_dump, False), (ext_machines_dump, True))
        for dump, is_external in sources:
            for word, dumped_def_graph in dump.iteritems():
                machine = Machine(word, ConceptControl())
                lexicon.add_def_graph(word, machine, dumped_def_graph)
                lexicon.add(word, machine, external=is_external)

        return lexicon
Beispiel #7
0
    def activate(self):
        """Activate the static machines whose parts are already active.

        A static machine is activated when its printname is not active yet,
        its partitions contain at least one machine, and every machine on
        its partitions either has a unicode form starting with '#' or is
        already in ``self.active``. Activation creates a new machine with
        the same printname and a shallow copy of the static machine's
        control, and registers it through ``self.add_active``.

        When exactly a machine should be activated is still up for
        consideration; this is the current rule.

        Returns the list of newly activated machines.
        """
        def satisfied(machine):
            return (unicode(machine).startswith(u'#')
                    or unicode(machine) in self.active)

        activated = []
        for printname, static_machines in self.static.iteritems():
            for static_machine in static_machines:
                # checked for every candidate: an earlier candidate of the
                # same printname may have been activated in the meantime
                if printname in self.active:
                    continue
                members = list(chain(*static_machine.partitions))
                if not members:
                    continue
                if not all(satisfied(member) for member in members):
                    continue
                fresh = Machine(printname, copy.copy(static_machine.control))
                self.add_active(fresh)
                activated.append(fresh)
        return activated
Beispiel #8
0
 def draw_single_graph(self, word, path):
     """Write one DOT file per definition of ``word`` into ``path``.

     The files are named ``<cleaned word>_<index>.dot``, where the index
     is the position of the machine in ``self.definitions[word]``. The
     DOT text is written UTF-8 encoded.
     """
     stem = Machine.d_clean(word)
     for index, machine in enumerate(self.definitions[word]):
         graph = MachineGraph.create_from_machines([machine])
         dot_name = '{0}_{1}.dot'.format(stem, index)
         with open(os.path.join(path, dot_name), 'w') as out:
             out.write(graph.to_dot().encode('utf-8'))
Beispiel #9
0
 def draw_single_graph(self, word, path):
     """Write one DOT file per definition of ``word`` into ``path``.

     The files are named ``<cleaned word>_<index>.dot``, where the index
     is the position of the machine in ``self.definitions[word]``. The
     DOT text is written UTF-8 encoded.
     """
     stem = Machine.d_clean(word)
     for index, machine in enumerate(self.definitions[word]):
         graph = MachineGraph.create_from_machines([machine])
         dot_name = '{0}_{1}.dot'.format(stem, index)
         with open(os.path.join(path, dot_name), 'w') as out:
             out.write(graph.to_dot().encode('utf-8'))
Beispiel #10
0
 def __build_definition_graph(self, root_def_m, static_m, def_graph, stop,
                              canonicals, deep_cases):
     """
     Walks through the machines reachable from @p static_m, and adds a
     reference to the corresponding canonical machines to the definition
     graph node (@p root_def_m).

     @param root_def_m the definition graph node that collects the
                       references; it is modified in place.
     @param static_m the machine whose children are visited.
     @param def_graph mapping from printname to a list whose first element
                      is the definition graph node of that printname.
     @param stop set of machines / printnames already visited; it is
                 modified in place and shared by the whole recursion.
     @param canonicals collection of machines; children found in it are
                       not descended into unless they are fancy.
     @param deep_cases if true, fancy children that are deep cases and
                       whose printname is not in @p stop are added to
                       @p root_def_m as new machines.
     """
     for static_child in static_m.children():
         if not static_child.fancy():
             # look up the definition node of the first static machine
             # stored for the child's printname
             cname = self.get_static_machine(
                 static_child.printname())[0].printname()
             def_child = def_graph[cname][0]
             # do not link the root to itself
             if def_child != root_def_m:
                 root_def_m.append(def_child)
         elif (deep_cases and static_child.deep_case()
               and static_child.printname() not in stop):
             root_def_m.append(Machine(static_child.printname()))
         if static_child.fancy() or static_child not in canonicals:
             # deep cases are added by their printname to stop, because as
             # of yet, hash is id-based for machines
             if (static_child not in stop
                     and static_child.printname() not in stop):
                 # mark the child as visited before descending into it
                 if static_child.fancy():
                     stop.add(static_child.printname())
                 else:
                     stop.add(static_child)
                 self.__build_definition_graph(root_def_m, static_child,
                                               def_graph, stop, canonicals,
                                               deep_cases)
Beispiel #11
0
def test():
    """Smoke test: copy a machine that is part of a reference cycle."""
    first = Machine("vonat")
    second = Machine("tb")
    # append each machine to the other one, so that they form a cycle
    first.append(second)
    second.append(first)
    duplicate = copy(first)
    assert duplicate
Beispiel #12
0
    def create_machine(self, name, partitions):
        """Create a machine with a concept control for ``name``.

        ``name`` may be a list of strings, which is concatenated first.
        Leading and trailing '<' / '>' characters are stripped. A name
        found in ``self.plur_dict`` is replaced by the value stored there,
        and the resulting machine gets a machine created for 'more'
        appended with index 0.

        Returns the new machine.
        """
        # lists are accepted because of ["=", "AGT"]
        if type(name) is list:
            name = "".join(name)

        # HACK until we find a good solution for defaults
        name = name.strip('<>')

        is_plur = name in self.plur_dict
        base_name = self.plur_dict[name] if is_plur else name

        machine = Machine(
            decode_from_proszeky(base_name), ConceptControl(), partitions)
        if not is_plur:
            return machine

        machine.append(self.create_machine('more', 1), 0)
        return machine
Beispiel #13
0
    def create_machine(self, name, partitions):
        """Create a machine with a concept control for ``name``.

        ``name`` may be a list of strings, which is concatenated first.
        Leading and trailing '<' / '>' characters are stripped. A name
        found in ``self.plur_dict`` is replaced by the value stored there,
        and the resulting machine gets a machine created for 'more'
        appended with index 0.

        Returns the new machine.
        """
        # lists are accepted because of ["=", "AGT"]
        if type(name) is list:
            name = "".join(name)

        # HACK until we find a good solution for defaults
        name = name.strip('<>')

        is_plur = name in self.plur_dict
        base_name = self.plur_dict[name] if is_plur else name

        machine = Machine(
            decode_from_proszeky(base_name), ConceptControl(), partitions)
        if not is_plur:
            return machine

        machine.append(self.create_machine('more', 1), 0)
        return machine
Beispiel #14
0
 def draw_word_graphs(self):
     """Write every definition graph to ``graphs/words`` as DOT files.

     For each word in ``self.definitions`` and each of its machines a
     file ``graphs/words/<name>_<i>.dot`` is written, where ``<name>`` is
     the cleaned word without a leading 'X' and ``<i>`` is the position
     of the machine in the word's list. The DOT text is written UTF-8
     encoded. Progress is logged after every 1000 words.
     """
     ensure_dir('graphs/words')
     for c, (word, machines) in enumerate(self.definitions.iteritems()):
         if c % 1000 == 0:
             logging.info("{0}...".format(c))
         # the file-name stem depends on the word only: compute it once
         clean_word = Machine.d_clean(word)
         # startswith() instead of [0], so an empty name cannot raise
         if clean_word.startswith('X'):
             clean_word = clean_word[1:]
         for i, machine in enumerate(machines):
             graph = MachineGraph.create_from_machines([machine])
             file_name = 'graphs/words/{0}_{1}.dot'.format(clean_word, i)
             # the context manager flushes and closes every file; without
             # it one descriptor per graph was left open
             with open(file_name, 'w') as dot_file:
                 dot_file.write(graph.to_dot().encode('utf-8'))
Beispiel #15
0
 def draw_word_graphs(self):
     """Write every definition graph to ``graphs/words`` as DOT files.

     For each word in ``self.definitions`` and each of its machines a
     file ``graphs/words/<name>_<i>.dot`` is written, where ``<name>`` is
     the cleaned word without a leading 'X' and ``<i>`` is the position
     of the machine in the word's list. The DOT text is written UTF-8
     encoded. Progress is logged after every 1000 words.
     """
     ensure_dir('graphs/words')
     for c, (word, machines) in enumerate(self.definitions.iteritems()):
         if c % 1000 == 0:
             logging.info("{0}...".format(c))
         # the file-name stem depends on the word only: compute it once
         clean_word = Machine.d_clean(word)
         # startswith() instead of [0], so an empty name cannot raise
         if clean_word.startswith('X'):
             clean_word = clean_word[1:]
         for i, machine in enumerate(machines):
             graph = MachineGraph.create_from_machines([machine])
             file_name = 'graphs/words/{0}_{1}.dot'.format(clean_word, i)
             # the context manager flushes and closes every file; without
             # it one descriptor per graph was left open
             with open(file_name, 'w') as dot_file:
                 dot_file.write(graph.to_dot().encode('utf-8'))
Beispiel #16
0
    def to_dot(self):
        """Return the DOT source of ``self.G`` as one string.

        Each node becomes a ``circle`` statement whose identifier is the
        cleaned node name and whose label is the cleaned part of that
        identifier before its last '_'. Nodes whose data carries a false
        ``expanded`` value are additionally drawn ``filled``. Only edges
        that have a ``color`` attribute are emitted; the attribute is used
        as the edge label. Node and edge statements are sorted to make the
        output deterministic.
        """
        filled_node = u'\t{0} [shape = circle, label = "{1}", style="filled"];'  # nopep8
        plain_node = u'\t{0} [shape = circle, label = "{1}"];'
        edge_template = u'\t{0} -> {1} [ label = "{2}" ];'

        node_lines = []
        for node, n_data in self.G.nodes(data=True):
            node_id = Machine.d_clean(node)
            # rpartition yields '' when the identifier contains no '_'
            printname = Machine.d_clean(node_id.rpartition('_')[0])
            unexpanded = 'expanded' in n_data and not n_data['expanded']
            template = filled_node if unexpanded else plain_node
            node_lines.append(
                template.format(node_id, printname).replace('-', '_'))
        node_lines.sort()

        edge_lines = []
        for source, target, edata in self.G.edges(data=True):
            if 'color' not in edata:
                continue
            # edge endpoints are passed through d_clean twice
            source_id = Machine.d_clean(Machine.d_clean(source))
            target_id = Machine.d_clean(Machine.d_clean(target))
            edge_lines.append(
                edge_template.format(source_id, target_id, edata['color']))
        edge_lines.sort()

        header = [u'digraph finite_state_machine {', '\tdpi=100;']
        return u'\n'.join(header + node_lines + edge_lines + ['}'])
Beispiel #17
0
    def to_dot(self):
        """Return the DOT source of ``self.G`` as one string.

        Each node becomes a ``circle`` statement whose identifier is the
        cleaned node name and whose label is the cleaned part of that
        identifier before its last '_'. Nodes whose data carries a false
        ``expanded`` value are additionally drawn ``filled``. Only edges
        that have a ``color`` attribute are emitted; the attribute is used
        as the edge label. Node and edge statements are sorted to make the
        output deterministic.
        """
        filled_node = u'\t{0} [shape = circle, label = "{1}", style="filled"];'  # nopep8
        plain_node = u'\t{0} [shape = circle, label = "{1}"];'
        edge_template = u'\t{0} -> {1} [ label = "{2}" ];'

        node_lines = []
        for node, n_data in self.G.nodes(data=True):
            node_id = Machine.d_clean(node)
            # rpartition yields '' when the identifier contains no '_'
            printname = Machine.d_clean(node_id.rpartition('_')[0])
            unexpanded = 'expanded' in n_data and not n_data['expanded']
            template = filled_node if unexpanded else plain_node
            node_lines.append(
                template.format(node_id, printname).replace('-', '_'))
        node_lines.sort()

        edge_lines = []
        for source, target, edata in self.G.edges(data=True):
            if 'color' not in edata:
                continue
            # edge endpoints are passed through d_clean twice
            source_id = Machine.d_clean(Machine.d_clean(source))
            target_id = Machine.d_clean(Machine.d_clean(target))
            edge_lines.append(
                edge_template.format(source_id, target_id, edata['color']))
        edge_lines.sort()

        header = [u'digraph finite_state_machine {', '\tdpi=100;']
        return u'\n'.join(header + node_lines + edge_lines + ['}'])
Beispiel #18
0
    def get_machine(self, printname, second=False):
        """Return the machine for ``printname``, registering one if needed.

        Lookup steps:
          1. the printname 'have' is looked up as "HAS";
          2. an active machine is preferred: the first key stored in
             ``self.active[printname]`` is returned;
          3. otherwise the first machine returned by
             ``self.get_static_machine(printname)`` is returned;
          4. if there is none, a new machine with a concept control is
             passed to ``self.add_static`` and the lookup is repeated once
             with ``second=True``.

        Args:
            printname: the name to look up.
            second: True on the retry after ``add_static``; a failing
                lookup then raises instead of retrying again.

        Raises:
            Exception: if no static machine is found on the retry.
        """
        if printname == 'have':
            logging.debug('have is changed to HAS')
            return self.get_machine("HAS")

        if printname in self.active:
            # list() keeps the indexing valid when keys() is not a list
            return list(self.active[printname].keys())[0]

        cands = self.get_static_machine(printname)
        if cands:
            return cands[0]

        if second:
            # the two halves of the message used to be joined without a
            # separating space
            raise Exception(
                "no machine with printname {0} ".format(printname) +
                "even after calling add_static for {0}".format(
                    Machine(printname, ConceptControl())))

        self.add_static(Machine(printname, ConceptControl()))
        return self.get_machine(printname, second=True)  # sanity check
Beispiel #19
0
 def act(self, seq):
     """Combine two machines of ``seq`` under a new relation machine.

     A machine named ``self.bin_rel`` is created. The machine at
     ``seq[self.first_pos]`` is appended to it with index 1 and the
     machine at ``seq[self.second_pos]`` with index 2.

     Returns a one-element list holding the new relation machine.
     """
     relation = Machine(self.bin_rel)
     for index, position in enumerate(
             (self.first_pos, self.second_pos), 1):
         relation.append(seq[position], index)
     return [relation]
Beispiel #20
0
 def to_dot(self):
     """Return the DOT source of ``self.G`` as one string.

     Each node becomes a ``circle`` statement; its identifier is the
     cleaned node name and its label the cleaned part of the identifier
     before its first '_'. Every edge found in the adjacency structure
     becomes a ``->`` statement labelled with the edge's ``color``
     attribute. Node and edge statements are sorted to make the output
     deterministic.
     """
     node_template = u'\t{0} [shape = circle, label = "{1}"];'
     edge_template = u'\t{0} -> {1} [ label = "{2}" ];'

     node_lines = []
     for node in self.G.nodes_iter():
         node_id = Machine.d_clean(node)
         label = Machine.d_clean(node_id.partition('_')[0])
         node_lines.append(
             node_template.format(node_id, label).replace('-', '_'))

     edge_lines = []
     for source, adjacency in self.G.adjacency_iter():
         source_id = Machine.d_clean(source).replace('-', '_')
         for target, edges in adjacency.iteritems():
             target_id = Machine.d_clean(target).replace('-', '_')
             # one statement per edge between the two nodes
             for attributes in edges.itervalues():
                 edge_lines.append(edge_template.format(
                     source_id, target_id, attributes['color']))

     dot = [u'digraph finite_state_machine {', '\tdpi=100;']
     dot += sorted(node_lines)
     dot += sorted(edge_lines)
     dot.append('}')
     return u'\n'.join(dot)
Beispiel #21
0
def test():
    """Smoke test: copy a machine that is part of a reference cycle."""
    first = Machine("vonat")
    second = Machine("tb")
    # append each machine to the other one, so that they form a cycle
    first.append(second)
    second.append(first)
    duplicate = copy(first)
    assert duplicate
Beispiel #22
0
    def get_dep_definition(self, word, deps):
        """Build the definition machine of ``word`` from parsed dependencies.

        The dependencies must contain exactly one entry whose first element
        is 'root'; otherwise a warning is logged and None is returned. The
        root word is lemmatized ('/' in the lemma is replaced by '_PER_';
        an empty lemma falls back to the root word itself). The machine of
        the root lemma is then appended with index 0 to the machine of
        ``word``, which is newly created when the dependencies did not
        yield one.

        Returns the machine of ``word``, or None without a unique root.
        """
        root_deps = [dep for dep in deps if dep[0] == 'root']
        if len(root_deps) != 1:
            logging.warning(
                u'no unique root dependency, skipping word "{0}"'.format(word))
            return None

        # the third element of the dependency is a (word, id) pair
        root_word, _ = root_deps[0][2]
        root_lemma = self.lemmatizer.lemmatize(root_word).replace('/', '_PER_')
        if not root_lemma:
            root_lemma = root_word

        word2machine = self.get_machines_from_parsed_deps(deps)
        root_machine = word2machine[root_lemma]

        fallback = Machine(word, ConceptControl())
        definition = word2machine.get(word, fallback)
        definition.append(root_machine, 0)
        return definition
Beispiel #23
0
 def __init__(self, name, lexicon, supp_dict, max_depth=3):
     """Set up the construction for the verb ``name``.

     Stores the arguments, creates the working area with a single
     'stem/VERB' machine, discovers the arguments of the canonical static
     machine of ``name``, generates the control and passes it to
     ``Construction.__init__``. The control is also written to the file
     'control.dot' in the current working directory.

     name -- printname of the verb; must be a key of ``lexicon.static``
     lexicon -- lexicon that holds the static machines
     supp_dict -- stored as ``self.supp_dict``
     max_depth -- stored as ``self.max_depth``
     """
     self.name = name
     self.lexicon = lexicon
     self.supp_dict = supp_dict
     self.max_depth = max_depth
     self.matchers = {}
     self.working_area = [Machine(None, KRPosControl('stem/VERB'))]
     # indexing 0th element in static because that is the canonical machine
     self.discover_arguments(lexicon.static[name][0])
     control = self.generate_control()
     self.case_pattern = re.compile("N(OUN|P)[^C]*CAS<([^>]*)>")
     Construction.__init__(self, name, control)
     self.activated = False
     logging.info('VerbConstruction {0} created. Matchers: {1}'.format(
         self.name, self.matchers))
     logging.info('Control: {0}'.format(self.control))
     # the context manager flushes and closes the dump; the handle used
     # to be left open
     with open('control.dot', 'w') as f:
         f.write(self.control.to_dot())
Beispiel #24
0
def dep_to_dot(deps, fn):
    """Write the dependencies ``deps`` to the file ``fn`` in DOT format.

    Two dependency structures are supported: dicts with the keys 'type',
    'dep' and 'gov' (the latter two holding a 'lemma'), and sequences of
    the form ``(type, (dep, ...), (gov, ...))``. Dependencies whose type
    is in EXCLUDE are skipped. Every word gets a rectangle node and every
    dependency an edge from the dependent to the governor, labelled with
    the dependency type. The file starts with HEADER and ends with a
    closing brace; the text is written UTF-8 encoded.
    """
    try:
        # new dep structure
        edges = [(d['dep']['lemma'], d['type'], d['gov']['lemma'])
                 for d in deps if d['type'] not in EXCLUDE]
    except (TypeError, KeyError):
        # old dep structure: indexing a sequence with a string key raises
        # TypeError; any other error is no longer silently swallowed
        edges = [(d[1][0], d[0], d[2][0]) for d in deps if d[0] not in EXCLUDE]
    words = set([e[0] for e in edges] + [e[2] for e in edges])
    lines = []
    for word in words:
        lines.append(u'\t{0} [shape=rectangle, label="{0}"];'.format(
            Machine.d_clean(word)))
    for edge in edges:
        dep, dtype, gov = map(Machine.d_clean, edge)
        lines.append(u'\t{0} -> {1} [label="{2}"];'.format(dep, gov, dtype))
    with open(fn, 'w') as f:
        f.write(HEADER.encode("utf-8"))
        f.write(u"\n".join(lines).encode("utf-8"))
        f.write("}\n")
Beispiel #25
0
def dep_to_dot(deps, fn):
    """Write the dependencies ``deps`` to the file ``fn`` in DOT format.

    Two dependency structures are supported: dicts with the keys 'type',
    'dep' and 'gov' (the latter two holding a 'lemma'), and sequences of
    the form ``(type, (dep, ...), (gov, ...))``. Dependencies whose type
    is in EXCLUDE are skipped. Every word gets a rectangle node and every
    dependency an edge from the dependent to the governor, labelled with
    the dependency type. The file starts with HEADER and ends with a
    closing brace; the text is written UTF-8 encoded.
    """
    try:
        # new dep structure
        edges = [
            (d['dep']['lemma'], d['type'], d['gov']['lemma']) for d in deps
            if d['type'] not in EXCLUDE]
    except (TypeError, KeyError):
        # old dep structure: indexing a sequence with a string key raises
        # TypeError; any other error is no longer silently swallowed
        edges = [(d[1][0], d[0], d[2][0]) for d in deps if d[0] not in EXCLUDE]
    words = set([e[0] for e in edges] + [e[2] for e in edges])
    lines = []
    for word in words:
        lines.append(u'\t{0} [shape=rectangle, label="{0}"];'.format(
            Machine.d_clean(word)))
    for edge in edges:
        dep, dtype, gov = map(Machine.d_clean, edge)
        lines.append(u'\t{0} -> {1} [label="{2}"];'.format(dep, gov, dtype))
    with open(fn, 'w') as f:
        f.write(HEADER.encode("utf-8"))
        f.write(u"\n".join(lines).encode("utf-8"))
        f.write("}\n")
Beispiel #26
0
def dep_to_dot(deps):
    """Return the dependencies ``deps`` as a DOT document.

    Two dependency structures are supported: dicts with the keys 'type',
    'dep' and 'gov' (the latter two holding a 'lemma'), and sequences of
    the form ``(type, (dep, ...), (gov, ...))``. The structure is chosen
    by looking at the first dependency. Dependencies whose type is in
    EXCLUDE are skipped. Every word gets a rectangle node and every
    dependency an edge from the dependent to the governor, labelled with
    the dependency type.

    An empty ``deps`` yields a document without nodes and edges.

    Returns HEADER, the node and edge lines and a closing brace as one
    UTF-8 encoded string.
    """
    # "deps and": an empty list has no first element to inspect
    if deps and isinstance(deps[0], dict):
        # new dep structure
        edges = [(d['dep']['lemma'], d['type'], d['gov']['lemma'])
                 for d in deps if d['type'] not in EXCLUDE]
    else:
        # old dep structure
        edges = [(d[1][0], d[0], d[2][0]) for d in deps if d[0] not in EXCLUDE]

    words = set([e[0] for e in edges] + [e[2] for e in edges])
    lines = []
    for word in words:
        lines.append(u'\t{0} [shape=rectangle, label="{0}"];'.format(
            Machine.d_clean(word)))
    for edge in edges:
        dep, dtype, gov = map(Machine.d_clean, edge)
        lines.append(u'\t{0} -> {1} [label="{2}"];'.format(dep, gov, dtype))

    dot_str = HEADER.encode("utf-8")
    dot_str += u"\n".join(lines).encode("utf-8")
    dot_str += "}\n"
    return dot_str
Beispiel #27
0
def dep_to_dot(deps):
    """Return the dependencies ``deps`` as a DOT document.

    Two dependency structures are supported: dicts with the keys 'type',
    'dep' and 'gov' (the latter two holding a 'lemma'), and sequences of
    the form ``(type, (dep, ...), (gov, ...))``. The structure is chosen
    by looking at the first dependency. Dependencies whose type is in
    EXCLUDE are skipped. Every word gets a rectangle node and every
    dependency an edge from the dependent to the governor, labelled with
    the dependency type.

    An empty ``deps`` yields a document without nodes and edges.

    Returns HEADER, the node and edge lines and a closing brace as one
    UTF-8 encoded string.
    """
    # "deps and": an empty list has no first element to inspect
    if deps and isinstance(deps[0], dict):
        # new dep structure
        edges = [
            (d['dep']['lemma'], d['type'], d['gov']['lemma']) for d in deps
            if d['type'] not in EXCLUDE]
    else:
        # old dep structure
        edges = [(d[1][0], d[0], d[2][0]) for d in deps if d[0] not in EXCLUDE]

    words = set([e[0] for e in edges] + [e[2] for e in edges])
    lines = []
    for word in words:
        lines.append(u'\t{0} [shape=rectangle, label="{0}"];'.format(
            Machine.d_clean(word)))
    for edge in edges:
        dep, dtype, gov = map(Machine.d_clean, edge)
        lines.append(u'\t{0} -> {1} [label="{2}"];'.format(dep, gov, dtype))

    dot_str = HEADER.encode("utf-8")
    dot_str += u"\n".join(lines).encode("utf-8")
    dot_str += "}\n"
    return dot_str
Beispiel #28
0
    def extract_definition_graph(self, deep_cases=False):
        """
        Builds the definition graph, a "flattened" version of the static
        graph: all canonical words in a definition are connected to the
        definiendum, as well as the canonical version of non-canonical
        terms. The structure of the definition is not preserved.

        @param deep_cases if @c False (the default), deep cases in the
                          definitions do not appear on the output graph.
        @return a dict that maps every printname of the static graph to a
                one-element list holding its definition graph node.
        """
        # the first static machine of every printname is the canonical one
        canonicals = set(machines[0] for machines in self.static.values())
        def_graph = dict(
            (name, [Machine(name)]) for name in self.static.keys())

        for name, static_machines in self.static.iteritems():
            canonical = static_machines[0]
            if canonical.fancy():
                continue
            # every definition is walked with its own, initially empty
            # stop set
            self.__build_definition_graph(def_graph[name][0], canonical,
                                          def_graph, set([]), canonicals,
                                          deep_cases)
        return def_graph
Beispiel #29
0
    def unify_recursively(self,
                          static_machine,
                          zeros_only,
                          first=False,
                          stop=None):
        """Returns the active machine that corresponds to @p static_machine. It
        recursively unifies all machines in all partitions of @p static_machine
        with machines in the active set.

        @param static_machine either a machine or a string (byte string or
                              unicode) holding a printname.
        @param zeros_only not used by this method itself; handed on
                          unchanged to the recursive calls.
        @param first not used by this method; the recursive calls always
                     pass @c False.
        @param stop the set of printnames already unified; a new set is
                    created when it is @c None.
        @return the active machine; @c None for 'IS_A' and for printnames
                starting with '#' that are not active yet (in that case
                wake_avm_construction is called instead).
        @throws TypeError if @p static_machine is neither a Machine nor a
                string.
        """
        if stop is None:
            stop = set()

        if unicode(static_machine) == u'IS_A':
            return None

        # byte strings and unicode strings are both printnames; checking
        # only for str used to send unicode printnames to the TypeError
        # at the end of the method
        is_string = isinstance(static_machine, basestring)
        if is_string:
            static_printname = static_machine
        else:
            static_printname = static_machine.printname()

        # If we have already unified this machine: just return
        if static_printname in stop:
            return self.active[static_printname].keys()[0]

        # If static_machine is a string, we don't have much to do
        if is_string:
            if static_machine in self.active:
                # FIXME: [0] is a hack, fix it
                return self.active[static_machine].keys()[0]
            if static_machine.startswith('#'):
                self.wake_avm_construction(static_machine)
                return None
            active_machine = Machine(static_machine, ConceptControl())
            self.__add_active_machine(active_machine)
            return active_machine

        if not isinstance(static_machine, Machine):
            raise TypeError('static_machine must be a Machine or a str')

        # It's a machine: find or create the corresponding active one
        static_name = static_machine.printname()
        if static_name in self.active:
            active_machine = self.active[static_name].keys()[0]
        else:
            if static_name.startswith('#'):
                self.wake_avm_construction(static_name)
                return None
            active_machine = Machine(static_name)
            # deepcopy causes infinite recursion, I hope shallow copy
            # works, since the active machine will update the control's
            # machine attribute (and we don't know of anything else)
            active_machine.set_control(copy.copy(static_machine.control))
            self.__add_active_machine(active_machine)

        # marked before descending, so that cycles end at the check above
        stop.add(static_name)

        # Now we have to walk through the tree recursively
        for i, part in enumerate(static_machine.partitions):
            for ss_machine in part:
                as_machine = self.unify_recursively(ss_machine,
                                                    zeros_only,
                                                    first=False,
                                                    stop=stop)
                if as_machine is not None:
                    active_machine.append(as_machine, i)
        return active_machine
Beispiel #30
0
    def __add_static_recursive(self, curr_from, replacement=None):
        """Registers @p curr_from and, recursively, its children in the
        static graph (self.static), and returns the machine that takes the
        place of @p curr_from there.

        @param curr_from the machine to process; its children are removed
                         from it and re-attached (in their replaced form)
                         to the returned machine.
        @param replacement maps the machines processed so far to their
                           replacements; shared by the whole recursion.
                           It is empty only while the outermost machine,
                           i.e. the definition word, is processed.
        @return replacement[curr_from]
        """
        if replacement is None:
            replacement = {}

        # Machines already in replacement have been processed: nothing to
        # do for them but to return their replacement
        if curr_from not in replacement:
            # Deep cases are not canonized
            if curr_from.deep_case():
                replacement[curr_from] = curr_from
            else:
                """
                try:
                    if curr_from.printname().isupper():
                        curr_from.printname_ = curr_from.printname().lower()
                except AttributeError, e:
                    logging.info('curr_from: {0}, type: {1}'.format(
                        curr_from, type(curr_from)))
                    raise Exception(e)
                """
                # Does this machine appear in the static tree?
                from_already_seen = self.__get_disambig_incomplete(
                    curr_from.printname())
                # If not: simply adding the new machine/definition...
                if len(from_already_seen) == 0:
                    # This is the definition word, or no children: accept as
                    # canonical / placeholder
                    if len(curr_from.children()) == 0 or len(replacement) == 0:
                        from_already_seen = [curr_from]
                    # Otherwise add a placeholder + itself to static
                    else:
                        from_already_seen = [
                            Machine(curr_from.printname()), curr_from
                        ]

                    self.static[curr_from.printname()] = from_already_seen
                    self.__add_to_disambig(curr_from.printname())
                    replacement[curr_from] = curr_from

                else:
                    # Definitions: the word is the canonical one, regardless of
                    # the number of children
                    if len(replacement) == 0:
                        # the existing canonical machine is kept, but takes
                        # over the printname and the control of curr_from
                        canonical = from_already_seen[0]
                        canonical.printname_ = curr_from.printname()
                        canonical.control = curr_from.control
                        replacement[curr_from] = canonical
                    # Handling non-definition words
                    else:
                        canonical = from_already_seen[0]
                        # No children: replace with the canonical
                        if len(curr_from.children()) == 0:
                            replacement[curr_from] = canonical
                        # Otherwise: add the new machine to static, and keep it
                        else:
                            replacement[curr_from] = curr_from
                            from_already_seen.append(curr_from)

            # Copying the children...
            curr_to = replacement[curr_from]
            # The partitions are copied first, because children are removed
            # from curr_from while they are being iterated over
            from_partitions = [[m for m in p] for p in curr_from.partitions]
            for part_i, part in enumerate(from_partitions):
                for child in part:
                    # Remove to delete any parent links
                    curr_from.remove(child, part_i)
                    curr_to.append(
                        self.__add_static_recursive(child, replacement),
                        part_i)

        return replacement[curr_from]
Beispiel #31
0
 def get_new_machine(self, printname):
     """Build and return a fresh machine named @p printname.

     The machine gets its own new ConceptControl. It is only handed back
     to the caller: this method does not add it to any lexicon.
     """
     control = ConceptControl()
     machine = Machine(printname, control)
     return machine
Beispiel #32
0
    def unify_recursively(self, static_machine, zeros_only, first=False,
                          stop=None):
        """Returns the active machine that corresponds to @p static_machine.

        Recursively unifies all machines in all partitions of
        @p static_machine with machines in the active set.

        @param static_machine either a Machine or a string (str or unicode)
               holding a printname.
        @param zeros_only not used by this method itself; it is only passed
               on to the recursive calls.
        @param first not used by this method itself; recursive calls always
               pass False.
        @param stop the set of printnames already unified during this walk;
               a fresh set is created when it is None.
        @return the active machine, or None for u'IS_A' and for names
                starting with '#' that are not active yet (for those,
                wake_avm_construction() is called instead).
        @raise TypeError if @p static_machine is neither a Machine nor a
               string.
        """
        if stop is None:
            stop = set()

        if unicode(static_machine) == u'IS_A':
            return None

        # basestring covers both str and unicode, so a unicode printname is
        # handled by the string branch below instead of ending in TypeError.
        is_name = isinstance(static_machine, basestring)
        if is_name:
            static_printname = static_machine
        else:
            static_printname = static_machine.printname()

        # If we have already unified this machine: just return
        if static_printname in stop:
            return self.active[static_printname].keys()[0]

        # If static_machine is a string, we don't have much to do
        if is_name:
            if static_printname in self.active:
                # FIXME: [0] is a hack, fix it
                return self.active[static_printname].keys()[0]
            if static_printname.startswith('#'):
                # Not activated as a machine: wake the construction instead
                self.wake_avm_construction(static_printname)
                return None
            active_machine = Machine(static_printname, ConceptControl())
            self.__add_active_machine(active_machine)
            return active_machine

        if not isinstance(static_machine, Machine):
            raise TypeError('static_machine must be a Machine or a str')

        # It's a machine: find or create the corresponding active one
        if static_printname in self.active:
            active_machine = self.active[static_printname].keys()[0]
        else:
            if static_printname.startswith('#'):
                self.wake_avm_construction(static_printname)
                return None
            active_machine = Machine(static_printname)
            # deepcopy causes infinite recursion, I hope shallow copy
            # works, since the active machine will update the control's
            # machine attribute (and we don't know of anything else)
            active_control = copy.copy(static_machine.control)
            active_machine.set_control(active_control)
            self.__add_active_machine(active_machine)

        # Mark the name before descending, so that a machine reachable from
        # its own subtree is returned from the active set, not walked again.
        stop.add(static_printname)

        # Now we have to walk through the tree recursively
        for i, part in enumerate(static_machine.partitions):
            for ss_machine in part:
                as_machine = self.unify_recursively(
                    ss_machine, zeros_only, first=False, stop=stop)
                if as_machine is not None:
                    active_machine.append(as_machine, i)
        return active_machine
Beispiel #33
0
                # logging.warning('duplicate pn: {0}, machines: {1}, {2}'.format(
                #    pn, d[pn], "{0}:{1}".format(m, m.partitions)))
            d[m.printname()].add(m)
            logging.debug('\n'+m.to_debug_str())
        except pyparsing.ParseException, pe:
            print l
            logging.error("Error: "+str(pe))
    return d

def read_plur(_file):
    """Read a plural-to-singular mapping from an iterable of lines.

    Each non-blank line must hold exactly two whitespace-separated fields:
    the plural form followed by its singular. Blank or whitespace-only
    lines (e.g. a trailing empty line) are skipped instead of aborting the
    whole read.

    @param _file an open file, or any other iterable of text lines.
    @return a dict mapping each plural form to its singular; when a plural
            occurs more than once, the last line wins.
    @raise ValueError if a non-blank line does not have exactly two fields.
    """
    plur_dict = {}
    for line in _file:
        fields = line.split()
        if not fields:
            # nothing but whitespace on this line
            continue
        plur, sg = fields
        plur_dict[plur] = sg
    return plur_dict

if __name__ == "__main__":
    # Command-line entry point. The mode is chosen by the first argument:
    #   -d   parse the last argument and print the machine debug dump
    #   -f   pass the file named by the second argument to read(); the
    #        result is only bound to a local name, nothing is printed
    #   else print the raw parse of the last argument
    logging.basicConfig(level=logging.WARNING,
                        format="%(asctime)s : %(module)s (%(lineno)s) " +
                        "- %(levelname)s - %(message)s")
    # NOTE(review): machine-specific absolute path, and it differs from the
    # relative plural path given to read() below -- confirm which is meant.
    # The with-block closes the file as soon as the mapping is loaded.
    with open('/home/recski/projects/4lang/4lang.plural') as plur_file:
        plur_dict = read_plur(plur_file)
    dp = DefinitionParser(plur_dict)
    pstr = sys.argv[-1]
    if sys.argv[1] == "-d":
        print(Machine.to_debug_str(dp.parse_into_machines(pstr),
                                   max_depth=99))
    elif sys.argv[1] == "-f":
        # open() replaces the deprecated file() builtin
        with open(sys.argv[2]) as lexicon_file:
            lexicon = read(lexicon_file, '../../res/4lang/4lang.plural',
                           three_parts=True)
    else:
        print(dp.parse(pstr))
Beispiel #34
0
                # logging.warning('duplicate pn: {0}, machines: {1}, {2}'.format(
                #    pn, d[pn], "{0}:{1}".format(m, m.partitions)))
            d[m.printname()].add(m)
            logging.debug('\n'+m.to_debug_str())
        except pyparsing.ParseException, pe:
            print l
            logging.error("Error: "+str(pe))
    return d

def read_plur(_file):
    """Read a plural-to-singular mapping from an iterable of lines.

    Each non-blank line must hold exactly two whitespace-separated fields:
    the plural form followed by its singular. Blank or whitespace-only
    lines (e.g. a trailing empty line) are skipped instead of aborting the
    whole read.

    @param _file an open file, or any other iterable of text lines.
    @return a dict mapping each plural form to its singular; when a plural
            occurs more than once, the last line wins.
    @raise ValueError if a non-blank line does not have exactly two fields.
    """
    plur_dict = {}
    for line in _file:
        fields = line.split()
        if not fields:
            # nothing but whitespace on this line
            continue
        plur, sg = fields
        plur_dict[plur] = sg
    return plur_dict

if __name__ == "__main__":
    # Command-line entry point. The mode is chosen by the first argument:
    #   -d   parse the last argument and print the machine debug dump
    #   -f   pass the file named by the second argument to read(); the
    #        result is only bound to a local name, nothing is printed
    #   else print the raw parse of the last argument
    plural_f = '../../4lang/4lang.plural'
    logging.basicConfig(level=logging.WARNING,
                        format="%(asctime)s : %(module)s (%(lineno)s) " +
                        "- %(levelname)s - %(message)s")
    # The with-block closes the file as soon as the mapping is loaded.
    with open(plural_f) as plur_file:
        plur_dict = read_plur(plur_file)
    dp = DefinitionParser(plur_dict)
    pstr = sys.argv[-1]
    if sys.argv[1] == "-d":
        print(Machine.to_debug_str(dp.parse_into_machines(pstr),
                                   max_depth=99))
    elif sys.argv[1] == "-f":
        # open() replaces the deprecated file() builtin
        with open(sys.argv[2]) as lexicon_file:
            lexicon = read(lexicon_file, plural_f, three_parts=True)
    else:
        print(dp.parse(pstr))