Example #1
    def __init__(self, nombre=None, alias=(), args=()):
        """
        Create a new ``AnalizadorDeContexto`` named ``nombre``, with
        ``alias``.

        ``nombre`` is optional and should be a string if given. It is used to
        tell AnalizadorDeContexto objects apart, and for use in an Analizador
        when determining which chunk of input might belong to a given
        AnalizadorDeContexto.

        ``alias`` is also optional and should be an iterable containing
        strings. Parsing will honor any aliases when trying to "find" a given
        context in its input.

        One or more ``args`` may be given, which is a quick alternative to
        calling ``for arg in args: self.agregar_arg(arg)`` after initialization.

        """
        self.args = Lexicon()
        self.args_posicionales = []
        self.banderas = Lexicon()
        self.banderas_inversas = {}  # No need for Lexicon here
        self.nombre = nombre
        self.alias = alias
        for arg in args:
            self.agregar_arg(arg)
Example #2
    def __init__(self, name=None, aliases=(), args=(), vararg=None):
        """
        Create a new ``ParserContext`` named ``name``, with ``aliases``.

        ``name`` is optional, and should be a string if given. It's used to
        tell ParserContext objects apart, and for use in a Parser when
        determining what chunk of input might belong to a given ParserContext.

        ``aliases`` is also optional and should be an iterable containing
        strings. Parsing will honor any aliases when trying to "find" a given
        context in its input.

        May give one or more ``args``, which is a quick alternative to calling
        ``for arg in args: self.add_arg(arg)`` after initialization.
        """
        self.args = Lexicon()
        self.positional_args = []
        self.flags = Lexicon()
        self.inverse_flags = {}  # No need for Lexicon here
        self.name = name
        self.aliases = aliases
        for arg in args:
            self.add_arg(arg)
        if vararg:
            self.varargs = []
        else:
            self.varargs = None
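A quick usage sketch for the constructor above, assuming an invoke-style ``Argument`` class with a ``names`` parameter (the names and import path below are illustrative, not taken from the examples): passing ``args`` at construction time is equivalent to calling ``add_arg`` once per argument afterwards.

# Hypothetical sketch: pre-populate a ParserContext with two flags.
from invoke.parser import Argument, ParserContext

verbose = Argument(names=('verbose', 'v'), kind=bool, default=False)
log = Argument(names=('log', 'l'), kind=str, default=None)

ctx = ParserContext(name='build', aliases=('compile',), args=(verbose, log))

# Equivalent to the post-initialization form mentioned in the docstring:
# ctx = ParserContext(name='build', aliases=('compile',))
# for arg in (verbose, log):
#     ctx.add_arg(arg)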
Example #3
def main(args):
    lexicon = Lexicon()
    lexicon.add_words("CAR", "CAT", "CARD", "CART")
    lexicon2 = Lexicon()
    lexicon2.add_words("CAT")
    print(lexicon2)
    grid = [["A", "A", "R"], ["T", "C", "D"]]
    words = list(search_grid(grid, lexicon))
    # Reversing so it prints results out in the order shown in the pdf
    words.reverse()
    print(words)
Example #4
 def ensure_deepcopy_works(self):
     l = Lexicon()
     l['foo'] = 'bar'
     eq_(l.foo, 'bar')
     l2 = copy.deepcopy(l)
     l2.foo = 'biz'
     assert l2.foo != l.foo
Example #5
 def dir_only_shows_real_keys(self):
     "dir() only shows real keys-as-attrs, not aliases"
     a = Lexicon({'key1': 'val1', 'key2': 'val2'})
     a.alias('myalias', 'key1')
     assert 'key1' in dir(a)
     assert 'key2' in dir(a)
     assert 'myalias' not in dir(a)
Example #6
def __main__():
    trainingCorpus = ParsedConllFile(keepMalformed=False, projectivize=True)

    trainingCorpus.read(
        open(trainingFile, 'r', encoding='utf-8').read())

    # make fake model params, enough for lexicon builder
    # we still need feature_maps to use ParserState
    modelParams = ModelParameters('')
    modelParams.trainingFile = trainingFile
    modelParams.cfg = {'projectivizeTrainingSet': True}

    lexicon = Lexicon(modelParams)
    lexicon.compute()

    sentence = trainingCorpus.sentences[0]

    parser_state = ParserState(sentence, lexicon.getFeatureMaps())

    # necessary for initializing and pushing root
    # (only initialize transition_state_class once!)
    # keep arc_state in sync with parser_state
    arc_state = transition_state_class(parser_state)

    dynamicOracleTrainTest(parser_state)
Example #7
 def test_is_prefix(self):
     lexicon = Lexicon()
     lexicon.add_words("CAT")
     self.assertTrue(lexicon.is_prefix("CAT"))
     self.assertTrue(lexicon.is_prefix("CA"))
     self.assertFalse(lexicon.is_prefix("DOG"))
     self.assertFalse(lexicon.is_prefix(""))
Example #8
def training():
    """takes the parallel corpus and sends it to lexicon"""
    filename = sys.argv[1]  # todo: look at "sys.argv" in the doc
    my_lexicon = Lexicon(filename)  # creates the Lexicon object "my_lexicon"
    my_lexicon.readfile()  # creates large lexicon from parallel file
    my_lexicon.minimize()  # picks most frequent word as value
    return my_lexicon
Example #9
 def ensure_deepcopy_works(self):
     lex = Lexicon()
     lex['foo'] = 'bar'
     eq_(lex.foo, 'bar')
     lex2 = copy.deepcopy(lex)
     lex2.foo = 'biz'
     assert lex2.foo != lex.foo
Example #10
def get_manual_lexicon():
    DAYS_OF_WEEK = [(s, '%s:_da' % s)
                    for s in ('monday', 'tuesday', 'wednesday', 'thursday',
                              'friday', 'saturday', 'sunday')]
    # For dates
    WORD_NUMBERS = [('one', '1:_dn'), ('two', '2:_dn'), ('three', '3:_dn'),
                    ('four', '4:_dn'), ('five', '5:_dn'), ('six', '6:_dn'),
                    ('seven', '7:_dn'), ('eight', '8:_dn'), ('nine', '9:_dn'),
                    ('ten', '10:_dn'), ('eleven', '11:_dn'),
                    ('twelve', '12:_dn'), ('thirteen', '13:_dn'),
                    ('fourteen', '14:_dn'), ('fifteen', '15:_dn'),
                    ('sixteen', '16:_dn'), ('seventeen', '17:_dn'),
                    ('eighteen', '18:_dn'), ('nineteen', '19:_dn'),
                    ('twenty', '20:_dn'), ('twenty one', '21:_dn'),
                    ('twenty two', '22:_dn'), ('twenty three', '23:_dn'),
                    ('twenty four', '24:_dn'), ('twenty five', '25:_dn'),
                    ('twenty six', '26:_dn'), ('twenty seven', '27:_dn'),
                    ('twenty eight', '28:_dn'), ('twenty nine', '29:_dn'),
                    ('thirty', '30:_dn'), ('thirty one', '31:_dn')]
    ORDINAL_NUMBERS = [
        ('second', '2:_dn'), ('third', '3:_dn'), ('fourth', '4:_dn'),
        ('fifth', '5:_dn'), ('sixth', '6:_dn'), ('seventh', '7:_dn'),
        ('eighth', '8:_dn'), ('ninth', '9:_dn'), ('tenth', '10:_dn'),
        ('eleventh', '11:_dn'), ('twelfth', '12:_dn'),
        ('thirteenth', '13:_dn'), ('fourteenth', '14:_dn'),
        ('fifteenth', '15:_dn'), ('sixteenth', '16:_dn'),
        ('seventeenth', '17:_dn'), ('eighteenth', '18:_dn'),
        ('nineteenth', '19:_dn'), ('twentieth', '20:_dn'),
        ('twenty first', '21:_dn'), ('twenty second', '22:_dn'),
        ('twenty third', '23:_dn'), ('twenty fourth', '24:_dn'),
        ('twenty fifth', '25:_dn'), ('twenty sixth', '26:_dn'),
        ('twenty seventh', '27:_dn'), ('twenty eighth', '28:_dn'),
        ('twenty ninth', '29:_dn'), ('thirtieth', '30:_dn'),
        ('thirty first', '31:_dn')
    ]  # Prefer first class to "first = 1"
    MEALS = [(m, '%s:_me' % m)
             for m in ('breakfast', 'lunch', 'dinner', 'snack')]

    lex = Lexicon()
    lex.add_entries(read_db('CITY.TAB', 1, 1, '_ci', strip_id=['.']))
    lex.add_entries(DAYS_OF_WEEK)
    lex.add_entries([(x + 's', y)
                     for x, y in DAYS_OF_WEEK])  # Handle "on tuesdays"
    lex.add_entries(
        read_db('AIRLINE.TAB', 0, 1, '_al', strip_name=[', inc.', ', ltd.']))
    handle_times(lex)
    lex.add_entries(read_db('INTERVAL.TAB', 0, 0, '_pd'))
    lex.add_entries(WORD_NUMBERS)
    lex.add_entries(ORDINAL_NUMBERS)
    lex.add_entries(read_db('MONTH.TAB', 1, 1, '_mn'))
    lex.add_entries(
        read_db('AIRPORT.TAB', 0, 1, '_ap', strip_name=[], split_name=['/']))
    lex.add_entries(read_db('COMP_CLS.TAB', 1, 1, '_cl'))
    lex.add_entries(read_db('CLS_SVC.TAB', 0, 0, '_fb', prefix_name='code '))
    handle_flight_numbers(lex)
    lex.add_entries(MEALS)
    handle_dollars(lex)
    return lex
Example #11
def read_Tischendorf_WH_Matthew_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf_MT()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH_MT()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()    
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon("tischlexicon_nonunique.txt", False)
    return tischrd
Example #12
def get_ccg_lexicon():
    lexicon = Lexicon()
    filename = os.path.join(LEXICON_DIR, 'geo-lexicon.txt')
    entries = []
    with open(filename) as f:
        for line in f:
            x, y = line.strip().split(' :- NP : ')
            entries.append((x, y))
    lexicon.add_entries(entries, False)
    return lexicon
Example #13
def process_xml_wiki(xmlf):
    process_pool = Pool(processes=PROCESS_AMOUNT)
    lexicon_db = Lexicon()
    with click.progressbar(
            process_pool.map(extract_mention_links_categories,
                             extract_page(xmlf)),
            label='obtaining mentions and links') as mention_link_progress_bar:
        for source_uri, lexicon, links, categories_list in mention_link_progress_bar:
            lexicon_db.insert_categories_uri(source_uri, categories_list)
            lexicon_db.insert_links_uri(source_uri, links)
            lexicon_db.insert_mentions_uris(lexicon)
Example #14
def get_ccg_lexicon():
    lexicon = Lexicon()
    filename = os.path.join(DB_DIR, 'lexicon.txt')
    entries = []
    with open(filename) as f:
        for line in f:
            x, y = line.strip().split(' :- NP : ')
            y = y.replace(':', ':_')
            entries.append((x, y))
    lexicon.add_entries(entries)
    return lexicon
Example #15
def read_Tischendorf_WH_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()
    tischrd.applyANLEXLemmaDictionary(
        "../text/lemmas/nonstrongs_forms_tags_lemmas.txt")
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon(
        "tischlexicon_nonunique.txt", False)
    return tischrd
Example #16
    def build(self, commons_path, corpora_path):
        # Prepare lexical dictionaries.
        self.words = Lexicon(self.words_normalize_digits)
        self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

        # Initialize training corpus.
        corpora = Corpora(corpora_path, commons_path)

        # Collect word and affix lexicons.
        for document in corpora:
            for token in document.tokens:
                word = token.word
                self.words.add(word)
                for s in self.get_suffixes(word):
                    assert type(s) is str
                    self.suffix.add(s)
        print "Words:", self.words.size(), "items in lexicon, including OOV"
        print "Suffix:", self.suffix.size(), "items in lexicon"

        # Load the commons store, but do not freeze it yet. We will add the
        # action table and cascade specification to it.
        self.commons_path = commons_path
        self.commons = sling.Store()
        self.commons.load(commons_path)
        schema = sling.DocumentSchema(self.commons)

        # Prepare action table and cascade.
        self._build_action_table(corpora)
        self.cascade = cascade.ShiftMarkCascade(self.actions)
        print self.cascade

        # Save cascade specification in commons.
        _ = self.cascade.as_frame(self.commons,
                                  delegate_cell_prefix="delegate")

        # Freeze the common store.
        self.commons.freeze()

        # Add feature specs.
        self._specify_features()
Example #17
    def make_lexicon(self):
        lexicon = Lexicon()

        the = Lexical("ART", {"ROOT": "?the", "AGR": "?v"})
        the.set_variable("?the", ["the"])
        the.set_variable("?v", ["3s", "3p"])
        lexicon.add_word("the", [the])

        dog = Lexical("N", {"ROOT": "?dog1", "AGR": "?3s"})
        dog.set_variable("?3s", ["3s"])
        dog.set_variable("?dog1", ["DOG1"])
        lexicon.add_word("dog", [dog])

        return lexicon
Example #18
 def train(self, train_trees):
     self.lexicon = Lexicon(train_trees)
     self.known_parses = {}
     self.span_to_categories = {}
     for train_tree in train_trees:
         tags = train_tree.get_preterminal_yield()
         tags = tuple(tags)  # because lists are not hashable, but tuples are
         if tags not in self.known_parses:
             self.known_parses[tags] = {}
         if train_tree not in self.known_parses[tags]:
             self.known_parses[tags][train_tree] = 1
         else:
             self.known_parses[tags][train_tree] += 1
         self.tally_spans(train_tree, 0)
Example #19
 def __init__(self, contexts=(), initial=None, ignore_unknown=False):
     self.initial = initial
     self.contexts = Lexicon()
     self.ignore_unknown = ignore_unknown
     for context in contexts:
         debug("Adding {}".format(context))
         if not context.name:
             raise ValueError("Non-initial contexts must have names.")
         exists = "A context named/aliased {!r} is already in this parser!"
         if context.name in self.contexts:
             raise ValueError(exists.format(context.name))
         self.contexts[context.name] = context
         for alias in context.aliases:
             if alias in self.contexts:
                 raise ValueError(exists.format(alias))
             self.contexts.alias(alias, to=context.name)
Example #20
 def __init__(self, contextos=(), inicial=None, ignorar_desconocido=False):
     self.inicial = inicial
     self.contextos = Lexicon()
     self.ignorar_desconocido = ignorar_desconocido
     for contexto in contextos:
         debug("Adding {}".format(contexto))
         if not contexto.nombre:
             raise ValueError(
                 "Non-initial contexts must have names.")
         exists = "A context named/aliased {!r} is already in this parser!"
         if contexto.nombre in self.contextos:
             raise ValueError(exists.format(contexto.nombre))
         self.contextos[contexto.nombre] = contexto
         for alias in contexto.alias:
             if alias in self.contextos:
                 raise ValueError(exists.format(alias))
             self.contextos.alias(alias, to=contexto.nombre)
Example #21
def get_lexicon_from_raw_lexicon_then_write(basename, newname):
    filename = os.path.join(LEXICON_DIR, basename)
    newfilename = os.path.join(LEXICON_DIR, newname)
    lex = Lexicon()
    entries = []
    with open(filename) as f:
        for line in f:
            lexicon_tuple = parse_entry(line)
            name = lexicon_tuple[0]
            entity = normalize_entity(lexicon_tuple[1])
            if entity == '':
                continue
            entries.append((name, entity))
    lex.add_entries(entries, False)
    with open(newfilename, 'w') as f:
        for name, entity in lex.entries.items():
            #print('%s :- NP : %s' % (name, entity), file=f)
            pass
    return lex
Example #22
def makeGold(world):
    english = {
        "baby": "baby",
        "bigbird": "bird",
        "bird": "bird",
        "books": "book",
        "bunnyrabbit": "bunny",
        "cows": "cow",
        "moocows": "cow",
        "duckie": "duck",
        "hand": "hand",
        "kitty": "kitty",
        "kittycats": "kitty",
        "lambie": "lamb",
        "pig": "pig",
        "piggies": "pig",
        "ring": "ring",
        "sheep": "sheep",
        "birdie": "duck",
        "bear": "bear",
        "bigbirds": "bird",
        "book": "book",
        "cow": "cow",
        "moocow": "cow",
        "duck": "duck",
        "eyes": "eyes",
        "hat": "hat",
        "kittycat": "kitty",
        "lamb": "lamb",
        "mirror": "mirror",
        "piggie": "pig",
        "rattle": "rattle",
        "rings": "ring",
        "bunnies": "bunny",
        "bird": "duck"
    }
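    # Note: "bird" appears twice as a key in the dict above; in a dict literal
    # the later value wins, so "bird" ends up mapped to "duck".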
    gold = Lexicon()
    for key in english:
        gold.words.append(world.words_key.index(key))
        gold.objects.append(world.objects_key.index(english[key]))
    return gold
Example #23
 def __init__(self, kind, name, actionName, fields, radius, cutoff,
              options):
     self.name = name
     self.kind = kind
     self.actionName = actionName
     self.fields = fields
     self.radius = int(radius)
     self.cutoff = int(cutoff)
     self.options = options
     if kind == "lex":
         if self.options != {}:
             sys.stderr.write('Lexicon features do not yet support options')
             sys.exit(-1)
         self.lexicon = Lexicon(actionName)
     elif kind in ("token", "sentence"):
         if actionName not in features.__dict__:
             sys.stderr.write("Unknown operator named " + actionName + "\n")
             sys.exit(-1)
         self.function = features.__dict__[actionName]
     else:
         assert False
Example #24
 def _load_lexicon(
     self,
     src_vocab: List[str],
     trg_vocab: List[str],
     align_model: str,
     threshold: float = 0.0,
     include_special_tokens: bool = False,
 ) -> Lexicon:
     lexicon = Lexicon()
     model_path = self.model_dir / f"src_trg_{align_model}.t{self.file_suffix}"
     for line in load_corpus(model_path):
         src_index_str, trg_index_str, prob_str = line.split(maxsplit=3)
         src_index = int(src_index_str)
         trg_index = int(trg_index_str)
         if include_special_tokens or (src_index > 1 and trg_index > 1):
             src_word = src_vocab[src_index]
             trg_word = trg_vocab[trg_index]
             prob = float(prob_str)
             if prob > threshold:
                 lexicon[src_word, trg_word] = prob
     return lexicon
Example #25
 def attributes_work(self):
     lex = Lexicon()
     lex.foo = 'bar'
     eq_(lex['foo'], lex.foo)
Example #26
    def __init__(self, *args, **kwargs):
        """
        Create a new task collection/namespace.

        `.Collection` offers a set of methods for building a collection of
        tasks from scratch, plus a convenient constructor wrapping said API.

        In either case:

        * the first positional argument may be a string, which (if given) is
          used as the collection's default name when performing namespace
          lookups;
        * a ``loaded_from`` keyword argument may be given, which sets metadata
          indicating the filesystem path the collection was loaded from. This
          is used as a guide when loading per-project :ref:`configuration files
          <config-hierarchy>`.

        **The method approach**

        May initialize with no arguments and use methods (e.g.
        `.add_task`/`.add_collection`) to insert objects::

            c = Collection()
            c.add_task(some_task)

        If an initial string argument is given, it is used as the default name
        for this collection, should it be inserted into another collection as a
        sub-namespace::

            docs = Collection('docs')
            docs.add_task(doc_task)
            ns = Collection()
            ns.add_task(top_level_task)
            ns.add_collection(docs)
            # Valid identifiers are now 'top_level_task' and 'docs.doc_task'
            # (assuming the task objects were actually named the same as the
            # variables we're using :))

        For details, see the API docs for the rest of the class.

        **The constructor approach**

        All ``*args`` given to `.Collection` (besides the abovementioned
        optional positional 'name' argument and ``loaded_from`` kwarg) are
        expected to be `.Task` or `.Collection` instances which will be passed
        to `.add_task`/`.add_collection` as appropriate. Module objects are
        also valid (as they are for `.add_collection`). For example, the below
        snippet results in the same two task identifiers as the one above::

            ns = Collection(top_level_task, Collection('docs', doc_task))

        If any ``**kwargs`` are given, the keywords are used as the initial
        name arguments for the respective values::

            ns = Collection(
                top_level_task=some_other_task,
                docs=Collection(doc_task)
            )

        That's exactly equivalent to::

            docs = Collection(doc_task)
            ns = Collection()
            ns.add_task(some_other_task, 'top_level_task')
            ns.add_collection(docs, 'docs')

        See individual methods' API docs for details.
        """
        # Initialize
        self.tasks = Lexicon()
        self.collections = Lexicon()
        self.default = None
        self.name = None
        self._configuration = {}
        # Name if applicable
        args = list(args)
        if args and isinstance(args[0], six.string_types):
            self.name = args.pop(0)
        # Specific kwargs if applicable
        self.loaded_from = kwargs.pop('loaded_from', None)
        # Dispatch args/kwargs
        for arg in args:
            self._add_object(arg)
        # Dispatch kwargs
        for name, obj in six.iteritems(kwargs):
            self._add_object(obj, name)
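Because the constructor above stores tasks and sub-collections in ``Lexicon`` objects, the resulting namespace supports both dict-style and attribute-style lookup. A minimal sketch, assuming invoke's ``@task`` decorator (names are illustrative):

# Hypothetical sketch: build a small namespace and access it both ways.
from invoke import Collection, task

@task
def build(c):
    pass

ns = Collection('release', build)
assert ns.tasks['build'] is ns.tasks.build  # Lexicon allows key or attribute access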
Example #27
 def aliased_real_attributes_do_not_override_real_attributes(self):
     lex = Lexicon()
     lex.alias('get', to='notget')
     lex.notget = 'value'
     assert callable(lex.get)
     assert lex.get != 'value'
Example #28
 def aliases_appear_in_attributes(self):
     lex = Lexicon()
     lex.alias('foo', to='bar')
     lex.foo = 'value'
     assert lex.foo == lex.bar == lex['foo'] == lex['bar'] == 'value'
Example #29
 def aliases_work(self):
     lex = Lexicon()
     lex.alias('foo', to='bar')
     lex['bar'] = 'value'
     assert lex['foo'] == lex['bar'] == 'value'
Example #30
    file_log_handler.setFormatter(file_log_formatter)
    logger.addHandler(file_log_handler)

    feature_tables_dir_path = join(dir_name, "tests/fixtures/feature_tables")
    constraint_sets_dir_path = join(dir_name, "tests/fixtures/constraint_sets")

    feature_table_file_path = join(feature_tables_dir_path,
                                   current_simulation.feature_table_file_name)
    feature_table = FeatureTable.load(feature_table_file_path)

    constraint_set_file_path = join(
        constraint_sets_dir_path, current_simulation.constraint_set_file_name)
    constraint_set = ConstraintSet.load(constraint_set_file_path)

    corpus = Corpus(current_simulation.corpus)

    data = corpus.get_words()
    max_word_length_in_data = max([len(word) for word in data])
    lexicon = Lexicon(data, max_word_length_in_data)

    grammar = Grammar(constraint_set, lexicon)
    hypothesis = Hypothesis(grammar, data)

    if hasattr(current_simulation, "target_energy"):
        target_energy = current_simulation.target_energy
    else:
        target_energy = None

    simulated_annealing = SimulatedAnnealing(hypothesis, target_energy)
    simulated_annealing.run()