def __init__(self, nombre=None, alias=(), args=()):
    """
    Create a new ``AnalizadorDeContexto`` named ``nombre``, with ``alias``.

    ``nombre`` is optional and should be a string if given. It is used to
    tell AnalizadorDeContexto objects apart, and for use in an Analizador
    when determining which chunk of input might belong to a given
    AnalizadorDeContexto.

    ``alias`` is also optional and should be an iterable containing
    strings. Parsing will honor any aliases when trying to "find" a given
    context in its input.

    May give one or more ``args``, which is a quick alternative to calling
    ``for arg in args: self.agregar_arg(arg)`` after initialization.
    """
    self.args = Lexicon()
    self.args_posicionales = []
    self.banderas = Lexicon()
    self.banderas_inversas = {}  # No need for Lexicon here
    self.nombre = nombre
    self.alias = alias
    for arg in args:
        self.agregar_arg(arg)
def __init__(self, name=None, aliases=(), args=(), vararg=None):
    """
    Create a new ``ParserContext`` named ``name``, with ``aliases``.

    ``name`` is optional, and should be a string if given. It's used to
    tell ParserContext objects apart, and for use in a Parser when
    determining what chunk of input might belong to a given ParserContext.

    ``aliases`` is also optional and should be an iterable containing
    strings. Parsing will honor any aliases when trying to "find" a given
    context in its input.

    May give one or more ``args``, which is a quick alternative to calling
    ``for arg in args: self.add_arg(arg)`` after initialization.
    """
    self.args = Lexicon()
    self.positional_args = []
    self.flags = Lexicon()
    self.inverse_flags = {}  # No need for Lexicon here
    self.name = name
    self.aliases = aliases
    for arg in args:
        self.add_arg(arg)
    if vararg:
        self.varargs = []
    else:
        self.varargs = None
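# Usage sketch (illustrative, not from the source; assumes invoke-style
# ``ParserContext``/``Argument`` classes matching the __init__ above).
# Passing ``args`` at construction time is the documented shortcut for
# calling add_arg afterwards:
ctx = ParserContext(name='build', aliases=('b',),
                    args=(Argument(names=('verbose', 'v')),))
# ...which is equivalent to:
ctx = ParserContext(name='build', aliases=('b',))
ctx.add_arg(Argument(names=('verbose', 'v')))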
def main(args):
    lexicon = Lexicon()
    lexicon.add_words("CAR", "CAT", "CARD", "CART")
    lexicon2 = Lexicon()
    lexicon2.add_words("CAT")
    print(lexicon2)
    grid = [["A", "A", "R"],
            ["T", "C", "D"]]
    words = list(search_grid(grid, lexicon))
    # Reversing so it prints results out in the order shown in the pdf
    words.reverse()
    print(words)
def ensure_deepcopy_works(self):
    l = Lexicon()
    l['foo'] = 'bar'
    eq_(l.foo, 'bar')
    l2 = copy.deepcopy(l)
    l2.foo = 'biz'
    assert l2.foo != l.foo
def dir_only_shows_real_keys(self):
    "dir() only shows real keys-as-attrs, not aliases"
    a = Lexicon({'key1': 'val1', 'key2': 'val2'})
    a.alias('myalias', 'key1')
    assert 'key1' in dir(a)
    assert 'key2' in dir(a)
    assert 'myalias' not in dir(a)
def __main__():
    trainingCorpus = ParsedConllFile(keepMalformed=False, projectivize=True)
    trainingCorpus.read(
        open(trainingFile, 'r', encoding='utf-8').read())

    # make fake model params, enough for lexicon builder
    # we still need feature_maps to use ParserState
    modelParams = ModelParameters('')
    modelParams.trainingFile = trainingFile
    modelParams.cfg = {'projectivizeTrainingSet': True}

    lexicon = Lexicon(modelParams)
    lexicon.compute()

    sentence = trainingCorpus.sentences[0]
    parser_state = ParserState(sentence, lexicon.getFeatureMaps())

    # necessary for initializing and pushing root
    # (only initialize transition_state_class once!)
    # keep arc_state in sync with parser_state
    arc_state = transition_state_class(parser_state)

    dynamicOracleTrainTest(parser_state)
def test_is_prefix(self):
    lexicon = Lexicon()
    lexicon.add_words("CAT")
    self.assertTrue(lexicon.is_prefix("CAT"))
    self.assertTrue(lexicon.is_prefix("CA"))
    self.assertFalse(lexicon.is_prefix("DOG"))
    self.assertFalse(lexicon.is_prefix(""))
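# Minimal sketch (an assumption for illustration, not the project's actual
# Lexicon): a prefix-set implementation consistent with the tests above,
# where every non-empty leading substring of an added word is a prefix and
# the empty string is not.
class Lexicon:
    def __init__(self):
        self.words = set()
        self.prefixes = set()

    def add_words(self, *words):
        for word in words:
            self.words.add(word)
            # Record "C", "CA", "CAT" for "CAT"; "" is never recorded.
            for i in range(1, len(word) + 1):
                self.prefixes.add(word[:i])

    def is_word(self, s):
        return s in self.words

    def is_prefix(self, s):
        return s in self.prefixes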
def training():
    """Takes the parallel corpus and sends it to the lexicon."""
    filename = sys.argv[1]  # todo: look at "sys.argv" in the doc
    my_lexicon = Lexicon(filename)  # creates the Lexicon object "my_lexicon"
    my_lexicon.readfile()   # creates large lexicon from parallel file
    my_lexicon.minimize()   # picks most frequent word as value
    return my_lexicon
def ensure_deepcopy_works(self):
    lex = Lexicon()
    lex['foo'] = 'bar'
    eq_(lex.foo, 'bar')
    lex2 = copy.deepcopy(lex)
    lex2.foo = 'biz'
    assert lex2.foo != lex.foo
def get_manual_lexicon():
    DAYS_OF_WEEK = [
        (s, '%s:_da' % s)
        for s in ('monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                  'saturday', 'sunday')
    ]
    # For dates
    WORD_NUMBERS = [
        ('one', '1:_dn'), ('two', '2:_dn'), ('three', '3:_dn'),
        ('four', '4:_dn'), ('five', '5:_dn'), ('six', '6:_dn'),
        ('seven', '7:_dn'), ('eight', '8:_dn'), ('nine', '9:_dn'),
        ('ten', '10:_dn'), ('eleven', '11:_dn'), ('twelve', '12:_dn'),
        ('thirteen', '13:_dn'), ('fourteen', '14:_dn'), ('fifteen', '15:_dn'),
        ('sixteen', '16:_dn'), ('seventeen', '17:_dn'), ('eighteen', '18:_dn'),
        ('nineteen', '19:_dn'), ('twenty', '20:_dn'), ('twenty one', '21:_dn'),
        ('twenty two', '22:_dn'), ('twenty three', '23:_dn'),
        ('twenty four', '24:_dn'), ('twenty five', '25:_dn'),
        ('twenty six', '26:_dn'), ('twenty seven', '27:_dn'),
        ('twenty eight', '28:_dn'), ('twenty nine', '29:_dn'),
        ('thirty', '30:_dn'), ('thirty one', '31:_dn'),
    ]
    ORDINAL_NUMBERS = [
        ('second', '2:_dn'), ('third', '3:_dn'), ('fourth', '4:_dn'),
        ('fifth', '5:_dn'), ('sixth', '6:_dn'), ('seventh', '7:_dn'),
        ('eighth', '8:_dn'), ('ninth', '9:_dn'), ('tenth', '10:_dn'),
        ('eleventh', '11:_dn'), ('twelfth', '12:_dn'),
        ('thirteenth', '13:_dn'), ('fourteenth', '14:_dn'),
        ('fifteenth', '15:_dn'), ('sixteenth', '16:_dn'),
        ('seventeenth', '17:_dn'), ('eighteenth', '18:_dn'),
        ('nineteenth', '19:_dn'), ('twentieth', '20:_dn'),
        ('twenty first', '21:_dn'), ('twenty second', '22:_dn'),
        ('twenty third', '23:_dn'), ('twenty fourth', '24:_dn'),
        ('twenty fifth', '25:_dn'), ('twenty sixth', '26:_dn'),
        ('twenty seventh', '27:_dn'), ('twenty eighth', '28:_dn'),
        ('twenty ninth', '29:_dn'), ('thirtieth', '30:_dn'),
        ('thirty first', '31:_dn'),
    ]  # Prefer first class to "first = 1"
    MEALS = [(m, '%s:_me' % m)
             for m in ('breakfast', 'lunch', 'dinner', 'snack')]

    lex = Lexicon()
    lex.add_entries(read_db('CITY.TAB', 1, 1, '_ci', strip_id=['.']))
    lex.add_entries(DAYS_OF_WEEK)
    lex.add_entries([(x + 's', y) for x, y in DAYS_OF_WEEK])  # Handle "on tuesdays"
    lex.add_entries(
        read_db('AIRLINE.TAB', 0, 1, '_al', strip_name=[', inc.', ', ltd.']))
    handle_times(lex)
    lex.add_entries(read_db('INTERVAL.TAB', 0, 0, '_pd'))
    lex.add_entries(WORD_NUMBERS)
    lex.add_entries(ORDINAL_NUMBERS)
    lex.add_entries(read_db('MONTH.TAB', 1, 1, '_mn'))
    lex.add_entries(
        read_db('AIRPORT.TAB', 0, 1, '_ap', strip_name=[], split_name=['/']))
    lex.add_entries(read_db('COMP_CLS.TAB', 1, 1, '_cl'))
    lex.add_entries(read_db('CLS_SVC.TAB', 0, 0, '_fb', prefix_name='code '))
    handle_flight_numbers(lex)
    lex.add_entries(MEALS)
    handle_dollars(lex)
    return lex
def read_Tischendorf_WH_Matthew_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf_MT()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH_MT()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon(
        "tischlexicon_nonunique.txt", False)
    return tischrd
def get_ccg_lexicon():
    lexicon = Lexicon()
    filename = os.path.join(LEXICON_DIR, 'geo-lexicon.txt')
    entries = []
    with open(filename) as f:
        for line in f:
            x, y = line.strip().split(' :- NP : ')
            entries.append((x, y))
    lexicon.add_entries(entries, False)
    return lexicon
def process_xml_wiki(xmlf):
    process_pool = Pool(processes=PROCESS_AMOUNT)
    lexicon_db = Lexicon()
    with click.progressbar(
            process_pool.map(extract_mention_links_categories,
                             extract_page(xmlf)),
            label='obtaining mentions and links') as mention_link_progress_bar:
        for source_uri, lexicon, links, categories_list in mention_link_progress_bar:
            lexicon_db.insert_categories_uri(source_uri, categories_list)
            lexicon_db.insert_links_uri(source_uri, links)
            lexicon_db.insert_mentions_uris(lexicon)
def get_ccg_lexicon():
    lexicon = Lexicon()
    filename = os.path.join(DB_DIR, 'lexicon.txt')
    entries = []
    with open(filename) as f:
        for line in f:
            x, y = line.strip().split(' :- NP : ')
            y = y.replace(':', ':_')
            entries.append((x, y))
    lexicon.add_entries(entries)
    return lexicon
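# Illustrative input line for the two loaders above (an assumed example,
# inferred from the ' :- NP : ' split, not taken from an actual data file):
#
#   new york :- NP : new_york:ci
#
# splits into x = 'new york' and y = 'new_york:ci'; in the second loader,
# y.replace(':', ':_') then yields 'new_york:_ci'.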
def read_Tischendorf_WH_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()
    tischrd.applyANLEXLemmaDictionary(
        "../text/lemmas/nonstrongs_forms_tags_lemmas.txt")
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon(
        "tischlexicon_nonunique.txt", False)
    return tischrd
def build(self, commons_path, corpora_path):
    # Prepare lexical dictionaries.
    self.words = Lexicon(self.words_normalize_digits)
    self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

    # Initialize training corpus.
    corpora = Corpora(corpora_path, commons_path)

    # Collect word and affix lexicons.
    for document in corpora:
        for token in document.tokens:
            word = token.word
            self.words.add(word)
            for s in self.get_suffixes(word):
                assert type(s) is str
                self.suffix.add(s)
    print "Words:", self.words.size(), "items in lexicon, including OOV"
    print "Suffix:", self.suffix.size(), "items in lexicon"

    # Load the common store, but don't freeze it yet. We will add the
    # action table and cascade specification to it.
    self.commons_path = commons_path
    self.commons = sling.Store()
    self.commons.load(commons_path)
    schema = sling.DocumentSchema(self.commons)

    # Prepare action table and cascade.
    self._build_action_table(corpora)
    self.cascade = cascade.ShiftMarkCascade(self.actions)
    print self.cascade

    # Save cascade specification in commons.
    _ = self.cascade.as_frame(self.commons, delegate_cell_prefix="delegate")

    # Freeze the common store.
    self.commons.freeze()

    # Add feature specs.
    self._specify_features()
def make_lexicon(self):
    lexicon = Lexicon()
    the = Lexical("ART", {"ROOT": "?the", "AGR": "?v"})
    the.set_variable("?the", ["the"])
    the.set_variable("?v", ["3s", "3p"])
    lexicon.add_word("the", [the])
    dog = Lexical("N", {"ROOT": "?dog1", "AGR": "?3s"})
    dog.set_variable("?3s", ["3s"])
    dog.set_variable("?dog1", ["DOG1"])
    lexicon.add_word("dog", [dog])
    return lexicon
def train(self, train_trees):
    self.lexicon = Lexicon(train_trees)
    self.known_parses = {}
    self.span_to_categories = {}
    for train_tree in train_trees:
        tags = train_tree.get_preterminal_yield()
        tags = tuple(tags)  # because lists are not hashable, but tuples are
        if tags not in self.known_parses:
            self.known_parses[tags] = {}
        if train_tree not in self.known_parses[tags]:
            self.known_parses[tags][train_tree] = 1
        else:
            self.known_parses[tags][train_tree] += 1
        self.tally_spans(train_tree, 0)
def __init__(self, contexts=(), initial=None, ignore_unknown=False):
    self.initial = initial
    self.contexts = Lexicon()
    self.ignore_unknown = ignore_unknown
    for context in contexts:
        debug("Adding {}".format(context))
        if not context.name:
            raise ValueError("Non-initial contexts must have names.")
        exists = "A context named/aliased {!r} is already in this parser!"
        if context.name in self.contexts:
            raise ValueError(exists.format(context.name))
        self.contexts[context.name] = context
        for alias in context.aliases:
            if alias in self.contexts:
                raise ValueError(exists.format(alias))
            self.contexts.alias(alias, to=context.name)
def __init__(self, contextos=(), inicial=None, ignorar_desconocido=False):
    self.inicial = inicial
    self.contextos = Lexicon()
    self.ignorar_desconocido = ignorar_desconocido
    for contexto in contextos:
        debug("Adding {}".format(contexto))
        if not contexto.nombre:
            raise ValueError("Non-initial contexts must have names.")
        exists = "A context named/aliased {!r} is already in this parser!"
        if contexto.nombre in self.contextos:
            raise ValueError(exists.format(contexto.nombre))
        self.contextos[contexto.nombre] = contexto
        for alias in contexto.alias:
            if alias in self.contextos:
                raise ValueError(exists.format(alias))
            self.contextos.alias(alias, to=contexto.nombre)
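# Usage sketch (illustrative; assumes an invoke-style Parser class with the
# __init__ shown above, plus the ParserContext class from earlier). Context
# names and aliases share one Lexicon, so an alias resolves to the same
# object as the name, and any duplicate raises ValueError:
core = ParserContext(name='core', aliases=('c',))
parser = Parser(contexts=(core,))
assert parser.contexts['c'] is parser.contexts['core']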
def get_lexicon_from_raw_lexicon_then_write(basename, newname):
    filename = os.path.join(LEXICON_DIR, basename)
    newfilename = os.path.join(LEXICON_DIR, newname)
    lex = Lexicon()
    entries = []
    with open(filename) as f:
        for line in f:
            lexicon_tuple = parse_entry(line)
            name = lexicon_tuple[0]
            entity = normalize_entity(lexicon_tuple[1])
            if entity == '':
                continue
            entries.append((name, entity))
    lex.add_entries(entries, False)
    with open(newfilename, 'w') as f:
        for name, entity in lex.entries.items():
            #print('%s :- NP : %s' % (name, entity), file=f)
            pass
    return lex
def makeGold(world):
    # NOTE: "bird" appears twice in this literal; Python keeps the last
    # value, so "bird" ultimately maps to "duck", not "bird".
    english = {
        "baby": "baby", "bigbird": "bird", "bird": "bird", "books": "book",
        "bunnyrabbit": "bunny", "cows": "cow", "moocows": "cow",
        "duckie": "duck", "hand": "hand", "kitty": "kitty",
        "kittycats": "kitty", "lambie": "lamb", "pig": "pig",
        "piggies": "pig", "ring": "ring", "sheep": "sheep",
        "birdie": "duck", "bear": "bear", "bigbirds": "bird",
        "book": "book", "cow": "cow", "moocow": "cow", "duck": "duck",
        "eyes": "eyes", "hat": "hat", "kittycat": "kitty", "lamb": "lamb",
        "mirror": "mirror", "piggie": "pig", "rattle": "rattle",
        "rings": "ring", "bunnies": "bunny", "bird": "duck",
    }
    gold = Lexicon()
    for key in english:
        gold.words.append(world.words_key.index(key))
        gold.objects.append(world.objects_key.index(english[key]))
    return gold
def __init__(self, kind, name, actionName, fields, radius, cutoff, options):
    self.name = name
    self.kind = kind
    self.actionName = actionName
    self.fields = fields
    self.radius = int(radius)
    self.cutoff = int(cutoff)
    self.options = options
    if kind == "lex":
        if self.options != {}:
            sys.stderr.write('Lexicon features do not yet support options\n')
            sys.exit(-1)
        self.lexicon = Lexicon(actionName)
    elif kind in ("token", "sentence"):
        if actionName not in features.__dict__:
            sys.stderr.write("Unknown operator named " + actionName + "\n")
            sys.exit(-1)
        self.function = features.__dict__[actionName]
    else:
        assert False
def _load_lexicon(
    self,
    src_vocab: List[str],
    trg_vocab: List[str],
    align_model: str,
    threshold: float = 0.0,
    include_special_tokens: bool = False,
) -> Lexicon:
    lexicon = Lexicon()
    model_path = self.model_dir / f"src_trg_{align_model}.t{self.file_suffix}"
    for line in load_corpus(model_path):
        src_index_str, trg_index_str, prob_str = line.split(maxsplit=3)
        src_index = int(src_index_str)
        trg_index = int(trg_index_str)
        if include_special_tokens or (src_index > 1 and trg_index > 1):
            src_word = src_vocab[src_index]
            trg_word = trg_vocab[trg_index]
            prob = float(prob_str)
            if prob > threshold:
                lexicon[src_word, trg_word] = prob
    return lexicon
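# Illustrative t-table line for _load_lexicon above (an assumed format,
# inferred from the three-field split: source index, target index,
# translation probability):
#
#   42 17 0.8315
#
# Indices 0 and 1 are skipped unless include_special_tokens is True, which
# suggests the first two vocabulary slots are reserved for special tokens.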
def attributes_work(self):
    lex = Lexicon()
    lex.foo = 'bar'
    eq_(lex['foo'], lex.foo)
def __init__(self, *args, **kwargs):
    """
    Create a new task collection/namespace.

    `.Collection` offers a set of methods for building a collection of
    tasks from scratch, plus a convenient constructor wrapping said API.

    In either case:

    * the first positional argument may be a string, which (if given) is
      used as the collection's default name when performing namespace
      lookups;
    * a ``loaded_from`` keyword argument may be given, which sets metadata
      indicating the filesystem path the collection was loaded from. This
      is used as a guide when loading per-project :ref:`configuration
      files <config-hierarchy>`.

    **The method approach**

    May initialize with no arguments and use methods (e.g.
    `.add_task`/`.add_collection`) to insert objects::

        c = Collection()
        c.add_task(some_task)

    If an initial string argument is given, it is used as the default name
    for this collection, should it be inserted into another collection as
    a sub-namespace::

        docs = Collection('docs')
        docs.add_task(doc_task)

        ns = Collection()
        ns.add_task(top_level_task)
        ns.add_collection(docs)

        # Valid identifiers are now 'top_level_task' and 'docs.doc_task'
        # (assuming the task objects were actually named the same as the
        # variables we're using :))

    For details, see the API docs for the rest of the class.

    **The constructor approach**

    All ``*args`` given to `.Collection` (besides the abovementioned
    optional positional 'name' argument and ``loaded_from`` kwarg) are
    expected to be `.Task` or `.Collection` instances which will be passed
    to `.add_task`/`.add_collection` as appropriate. Module objects are
    also valid (as they are for `.add_collection`). For example, the below
    snippet results in the same two task identifiers as the one above::

        ns = Collection(top_level_task, Collection('docs', doc_task))

    If any ``**kwargs`` are given, the keywords are used as the initial
    name arguments for the respective values::

        ns = Collection(
            top_level_task=some_other_task,
            docs=Collection(doc_task)
        )

    That's exactly equivalent to::

        docs = Collection(doc_task)
        ns = Collection()
        ns.add_task(some_other_task, 'top_level_task')
        ns.add_collection(docs, 'docs')

    See individual methods' API docs for details.
    """
    # Initialize
    self.tasks = Lexicon()
    self.collections = Lexicon()
    self.default = None
    self.name = None
    self._configuration = {}
    # Name if applicable
    args = list(args)
    if args and isinstance(args[0], six.string_types):
        self.name = args.pop(0)
    # Specific kwargs if applicable
    self.loaded_from = kwargs.pop('loaded_from', None)
    # Dispatch args
    for arg in args:
        self._add_object(arg)
    # Dispatch kwargs
    for name, obj in six.iteritems(kwargs):
        self._add_object(obj, name)
def aliased_real_attributes_do_not_override_real_attributes(self):
    lex = Lexicon()
    lex.alias('get', to='notget')
    lex.notget = 'value'
    assert callable(lex.get)
    assert lex.get != 'value'
def aliases_appear_in_attributes(self):
    lex = Lexicon()
    lex.alias('foo', to='bar')
    lex.foo = 'value'
    assert lex.foo == lex.bar == lex['foo'] == lex['bar'] == 'value'
def aliases_work(self):
    lex = Lexicon()
    lex.alias('foo', to='bar')
    lex['bar'] = 'value'
    assert lex['foo'] == lex['bar'] == 'value'
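# Minimal sketch of the alias semantics these tests exercise (an assumption
# for illustration; the real `lexicon` package is more complete). Keys are
# exposed as attributes, aliases resolve to their targets for both get and
# set, and real attributes such as dict.get always win over aliases.
class AliasDict(dict):
    def __init__(self, *args, **kwargs):
        # Bypass our __setattr__ so 'aliases' is a real attribute, not a key.
        object.__setattr__(self, 'aliases', {})
        super().__init__(*args, **kwargs)

    def alias(self, from_, to):
        self.aliases[from_] = to

    def _resolve(self, key):
        while key in self.aliases:
            key = self.aliases[key]
        return key

    def __getitem__(self, key):
        return super().__getitem__(self._resolve(key))

    def __setitem__(self, key, value):
        super().__setitem__(self._resolve(key), value)

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, so real
        # attributes (e.g. dict.get) are never shadowed by aliases.
        if name.startswith('__') or name == 'aliases':
            raise AttributeError(name)
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __dir__(self):
        # dir() shows real keys as attributes, but not aliases.
        return list(super().__dir__()) + list(self.keys())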
file_log_handler.setFormatter(file_log_formatter)
logger.addHandler(file_log_handler)

feature_tables_dir_path = join(dir_name, "tests/fixtures/feature_tables")
constraint_sets_dir_path = join(dir_name, "tests/fixtures/constraint_sets")

feature_table_file_path = join(feature_tables_dir_path,
                               current_simulation.feature_table_file_name)
feature_table = FeatureTable.load(feature_table_file_path)

constraint_set_file_path = join(constraint_sets_dir_path,
                                current_simulation.constraint_set_file_name)
constraint_set = ConstraintSet.load(constraint_set_file_path)

corpus = Corpus(current_simulation.corpus)
data = corpus.get_words()
max_word_length_in_data = max(len(word) for word in data)

lexicon = Lexicon(data, max_word_length_in_data)
grammar = Grammar(constraint_set, lexicon)
hypothesis = Hypothesis(grammar, data)

if hasattr(current_simulation, "target_energy"):
    target_energy = current_simulation.target_energy
else:
    target_energy = None

simulated_annealing = SimulatedAnnealing(hypothesis, target_energy)
simulated_annealing.run()