def __init__(self, name=None, aliases=(), args=(), vararg=None):
    """
    Create a new ``ParserContext`` named ``name``, with ``aliases``.

    ``name`` is optional, and should be a string if given. It's used to
    tell ParserContext objects apart, and for use in a Parser when
    determining what chunk of input might belong to a given ParserContext.

    ``aliases`` is also optional and should be an iterable containing
    strings. Parsing will honor any aliases when trying to "find" a given
    context in its input.

    May give one or more ``args``, which is a quick alternative to calling
    ``for arg in args: self.add_arg(arg)`` after initialization.
    """
    self.args = Lexicon()
    self.positional_args = []
    self.flags = Lexicon()
    self.inverse_flags = {}  # No need for Lexicon here
    self.name = name
    self.aliases = aliases
    for arg in args:
        self.add_arg(arg)
    if vararg:
        self.varargs = []
    else:
        self.varargs = None
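A minimal usage sketch for the constructor above, assuming it belongs to an invoke-style ``ParserContext`` class; no ``args`` are passed, so ``add_arg`` is not exercised here:

# Sketch only: ParserContext is assumed to be the class owning the
# __init__ above, with Lexicon behaving as a dict-with-aliases.
ctx = ParserContext(name='build', aliases=('b', 'compile'))
assert ctx.name == 'build'
assert ctx.aliases == ('b', 'compile')
assert len(ctx.args) == 0      # the args Lexicon starts out empty
assert ctx.varargs is None     # no vararg requested

vctx = ParserContext(name='run', vararg=True)
assert vctx.varargs == []      # vararg=True switches varargs to a list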
def __main__():
    trainingCorpus = ParsedConllFile(keepMalformed=False, projectivize=True)
    trainingCorpus.read(
        open(trainingFile, 'r', encoding='utf-8').read())

    # make fake model params, enough for lexicon builder
    # we still need feature_maps to use ParserState
    modelParams = ModelParameters('')
    modelParams.trainingFile = trainingFile
    modelParams.cfg = {'projectivizeTrainingSet': True}

    lexicon = Lexicon(modelParams)
    lexicon.compute()

    sentence = trainingCorpus.sentences[0]
    parser_state = ParserState(sentence, lexicon.getFeatureMaps())

    # necessary for initializing and pushing root
    # (only initialize transition_state_class once!)
    # keep arc_state in sync with parser_state
    arc_state = transition_state_class(parser_state)

    dynamicOracleTrainTest(parser_state)
class Collection(object):
    def __init__(self):
        self.tasks = Lexicon()
        self.default = None

    def add_task(self, name, task, aliases=(), default=False):
        """
        Adds callable object ``task`` to this collection under name ``name``.

        If ``aliases`` is given, will be used to set up additional aliases
        for this task.

        ``default`` may be set to ``True`` to set the task as this
        collection's default invocation.
        """
        self.tasks[name] = task
        for alias in aliases:
            self.tasks.alias(alias, to=name)
        if default:
            if self.default:
                msg = "'%s' cannot be the default because '%s' already is!"
                raise ValueError(msg % (name, self.default))
            self.default = name

    def __getitem__(self, name=None):
        """
        Returns task named ``name``. Honors aliases.

        If this collection has a default task, it is returned when ``name``
        is empty or ``None``. If empty input is given and no task has been
        selected as the default, ValueError will be raised.
        """
        if not name:
            if self.default:
                return self[self.default]
            else:
                raise ValueError("This collection has no default task.")
        return self.tasks[name]

    def to_contexts(self):
        """
        Returns all contained tasks and subtasks as a list of parser contexts.
        """
        result = []
        for name, task in self.tasks.iteritems():
            context = Context(name=name, aliases=task.aliases)
            argspec = task.argspec
            for name, default in argspec.iteritems():
                # Handle arg options
                opts = {}
                if default is not None:
                    opts['kind'] = type(default)
                # Handle aliases (auto shortflags, etc)
                names = [name]
                names.extend(argspec.aliases_of(name))
                # Create/add the argument
                context.add_arg(names=names, **opts)
            result.append(context)
        return result
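A quick illustration of the ``Collection`` API above. This is a sketch where the tasks are plain callables, which is all ``add_task`` requires; ``to_contexts`` is not exercised, since it expects richer task objects:

def build():
    return 'built'

def deploy():
    return 'deployed'

coll = Collection()
coll.add_task('build', build, aliases=('b',), default=True)
coll.add_task('deploy', deploy)

assert coll['build'] is build   # direct lookup
assert coll['b'] is build       # alias lookup, backed by Lexicon.alias
assert coll[None] is build      # empty name falls back to the default task

try:
    coll.add_task('deploy', deploy, default=True)
except ValueError as e:
    print(e)  # 'deploy' cannot be the default because 'build' already is!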
def dir_only_shows_real_keys(self):
    "dir() only shows real keys-as-attrs, not aliases"
    a = Lexicon({'key1': 'val1', 'key2': 'val2'})
    a.alias('myalias', 'key1')
    assert 'key1' in dir(a)
    assert 'key2' in dir(a)
    assert 'myalias' not in dir(a)
def __init__(self, nombre=None, alias=(), args=()):
    """
    Create a new ``AnalizadorDeContexto`` named ``nombre``, with ``alias``.

    ``nombre`` is optional, and should be a string if given. It's used to
    tell AnalizadorDeContexto objects apart, and for use in an Analizador
    when determining what chunk of input might belong to a given
    AnalizadorDeContexto.

    ``alias`` is also optional and should be an iterable containing
    strings. Parsing will honor any aliases when trying to "find" a given
    context in its input.

    May give one or more ``args``, which is a quick alternative to calling
    ``for arg in args: self.agregar_arg(arg)`` after initialization.
    """
    self.args = Lexicon()
    self.args_posicionales = []
    self.banderas = Lexicon()
    self.banderas_inversas = {}  # No need for Lexicon here
    self.nombre = nombre
    self.alias = alias
    for arg in args:
        self.agregar_arg(arg)
def getLexicons(self, l_type='Senti'):
    lexicon = Lexicon()
    if l_type == 'Senti':
        lexicon.getLexiconsFromSentiment()
    elif l_type == 'PN':
        lexicon.getLexiconsFromPN()
    self.lexicons = lexicon.lexicons
class InvertedIndex():
    def __init__(self):
        self.invertedindex = {}
        self.lexicon = Lexicon()
        self.tokenizer = Tokenizer()
        self.doc_reader = DocReader()
        self.build_index()

    def build_index(self):
        # Read cached documents and index them one by one.
        cache = self.doc_reader.get_cache()
        docs = self.doc_reader.read_docs(cache)
        print "\nINVERTEDINDEX : Indexing %d documents..\n" % len(docs)
        for d in range(len(docs)):
            print "Indexing document '%s'" % (settings.PATH_DOCS + str(d))
            self.add_document(docs[d], d)
        print "Indexed total %d unique terms" % self.lexicon.size()

    def get_postinglist(self, lex_id):
        return self.invertedindex[lex_id]

    def add_document(self, doc, document_id):
        """FIXME:
        - Needs doc
        - Too slow?
        - Remove stop words
        - Reduce number of tokens
        """
        tokens = self.tokenizer.tokenize(doc)
        for t in tokens:
            lex_id = self.lexicon.lookup(t.get_value())
            if lex_id == settings.INVALID:
                lex_id = self.lexicon.add_value(t.get_value())
                pl = PostingList()
                pl.append_posting(Posting(document_id, t.get_position()))
                self.invertedindex[lex_id] = pl
            else:
                pl = self.get_postinglist(lex_id)
                if pl.get_last_posting().get_document_id() != document_id:
                    pl.append_posting(Posting(document_id, t.get_position()))
                else:
                    p = pl.get_last_posting()
                    p.append_position(t.get_position())

    def size(self):
        return len(self.invertedindex)

    def debugprint(self):
        voc = self.lexicon.get_vocabulary()
        for v in voc:
            lid = self.lexicon.lookup(v)
            pl = self.get_postinglist(lid)
            print "[%s]" % v
            pl.info()
def create_lexicon():
    lexicon = Lexicon()
    words_file = open(ENGLISH_WORDS, 'r')
    print '=== CREATING LEXICON ==='
    print '=== READING WORDS FILE ==='
    for word in words_file:
        lexicon.add_word(word)
    print '=== LEXICON CREATED ==='
    return lexicon
def get_ccg_lexicon():
    lexicon = Lexicon()
    filename = os.path.join(LEXICON_DIR, 'geo-lexicon.txt')
    entries = []
    with open(filename) as f:
        for line in f:
            x, y = line.strip().split(' :- NP : ')
            entries.append((x, y))
    lexicon.add_entries(entries, False)
    return lexicon
def get_ccg_lexicon():
    lexicon = Lexicon()
    filename = os.path.join(DB_DIR, 'lexicon.txt')
    entries = []
    with open(filename) as f:
        for line in f:
            x, y = line.strip().split(' :- NP : ')
            y = y.replace(':', ':_')
            entries.append((x, y))
    lexicon.add_entries(entries)
    return lexicon
def test_is_prefix(self):
    lexicon = Lexicon()
    lexicon.add_words("CAT")
    self.assertTrue(lexicon.is_prefix("CAT"))
    self.assertTrue(lexicon.is_prefix("CA"))
    self.assertFalse(lexicon.is_prefix("DOG"))
    self.assertFalse(lexicon.is_prefix(""))
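The test above pins down the expected ``add_words``/``is_prefix`` behavior. A hypothetical minimal implementation consistent with it (the real project's Lexicon may well use a trie instead) could look like this:

class PrefixLexicon:
    """Hypothetical sketch: stores every word plus all of its non-empty prefixes."""

    def __init__(self):
        self.words = set()
        self.prefixes = set()

    def add_words(self, *words):
        for word in words:
            self.words.add(word)
            for i in range(1, len(word) + 1):
                self.prefixes.add(word[:i])

    def is_prefix(self, s):
        # The empty string is deliberately not a prefix, matching the test.
        return s in self.prefixes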
def train(self, train_trees):
    self.lexicon = Lexicon(train_trees)
    self.known_parses = {}
    self.span_to_categories = {}
    for train_tree in train_trees:
        tags = train_tree.get_preterminal_yield()
        tags = tuple(tags)  # because lists are not hashable, but tuples are
        if tags not in self.known_parses:
            self.known_parses[tags] = {}
        if train_tree not in self.known_parses[tags]:
            self.known_parses[tags][train_tree] = 1
        else:
            self.known_parses[tags][train_tree] += 1
        self.tally_spans(train_tree, 0)
def __init__(self, code="zxx"):
    """
    Create a lect object.

    A I{lect} is a language variety; it can be either a spoken or a written
    form, a colloquial, mediatic or standard form, and so on. It wraps
    serialization and high-level features.

    It contains three independent internal members:
        - L{lexicon<lexicon>}
        - L{grammar<grammar>}
        - L{inflections<inflection>}

    @type code: str
    @param code: A language code according to the U{ISO<http://www.iso.org>}
        standard. For language codes, refer to the 639-3 specification.
        A country/variety code and a representation system might be added:
        C{eng-US}, C{esp:ERG}, C{por-BR:IPA}.
    """
    self.code = code
    self.name = u""
    self.english_name = ""
    self.__p_o_s = ()
    self.__lemma_categories = {}
    self.__categories = {}
    self.grammar = Grammar(code)
    self.lexicon = Lexicon()
    self.inflections = Inflections()
    self.properties = {"separator": " ", "capitalization": "3"}  # Lexical and Initials
def __init__(self, contexts=(), initial=None, ignore_unknown=False):
    self.initial = initial
    self.contexts = Lexicon()
    self.ignore_unknown = ignore_unknown
    for context in contexts:
        debug("Adding {}".format(context))
        if not context.name:
            raise ValueError("Non-initial contexts must have names.")
        exists = "A context named/aliased {!r} is already in this parser!"
        if context.name in self.contexts:
            raise ValueError(exists.format(context.name))
        self.contexts[context.name] = context
        for alias in context.aliases:
            if alias in self.contexts:
                raise ValueError(exists.format(alias))
            self.contexts.alias(alias, to=context.name)
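A small sketch of the duplicate name/alias guard above. ``FakeContext`` is a hypothetical stand-in exposing only the ``name`` and ``aliases`` attributes the constructor reads, and ``debug`` is assumed to resolve to a logging helper in the surrounding module:

from collections import namedtuple

FakeContext = namedtuple('FakeContext', 'name aliases')  # hypothetical stand-in

parser = Parser(contexts=(FakeContext('build', ('b',)),))
assert parser.contexts['b'] is parser.contexts['build']  # alias resolves to the same context

try:
    Parser(contexts=(FakeContext('build', ()), FakeContext('b', ('build',))))
except ValueError as err:
    print(err)  # A context named/aliased 'build' is already in this parser!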
def ensure_deepcopy_works(self):
    l = Lexicon()
    l['foo'] = 'bar'
    eq_(l.foo, 'bar')
    l2 = copy.deepcopy(l)
    l2.foo = 'biz'
    assert l2.foo != l.foo
def ensure_deepcopy_works(self):
    lex = Lexicon()
    lex['foo'] = 'bar'
    eq_(lex.foo, 'bar')
    lex2 = copy.deepcopy(lex)
    lex2.foo = 'biz'
    assert lex2.foo != lex.foo
def extract_lexicon(self, out_file_path: Path, threshold: float = 0.0) -> None:
    src_vocab = self._load_vocab("src")
    trg_vocab = self._load_vocab("trg")
    direct_lexicon = self._load_lexicon(src_vocab, trg_vocab, "invswm", threshold=threshold)
    inverse_lexicon = self._load_lexicon(trg_vocab, src_vocab, "swm", threshold=threshold)
    lexicon = Lexicon.symmetrize(direct_lexicon, inverse_lexicon, threshold=threshold)
    lexicon.write(out_file_path)
def __init__(self, contextos=(), inicial=None, ignorar_desconocido=False):
    self.inicial = inicial
    self.contextos = Lexicon()
    self.ignorar_desconocido = ignorar_desconocido
    for contexto in contextos:
        debug("Adding {}".format(contexto))
        if not contexto.nombre:
            raise ValueError("Non-initial contexts must have names.")
        exists = "A context named/aliased {!r} is already in this parser!"
        if contexto.nombre in self.contextos:
            raise ValueError(exists.format(contexto.nombre))
        self.contextos[contexto.nombre] = contexto
        for alias in contexto.alias:
            if alias in self.contextos:
                raise ValueError(exists.format(alias))
            self.contextos.alias(alias, to=contexto.nombre)
def read_Tischendorf_WH_Matthew_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf_MT()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH_MT()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon("tischlexicon_nonunique.txt", False)
    return tischrd
def process_xml_wiki(xmlf):
    process_pool = Pool(processes=PROCESS_AMOUNT)
    lexicon_db = Lexicon()
    with click.progressbar(
            process_pool.map(extract_mention_links_categories, extract_page(xmlf)),
            label='obtaining mentions and links') as mention_link_progress_bar:
        for source_uri, lexicon, links, categories_list in mention_link_progress_bar:
            lexicon_db.insert_categories_uri(source_uri, categories_list)
            lexicon_db.insert_links_uri(source_uri, links)
            lexicon_db.insert_mentions_uris(lexicon)
def main(args):
    lexicon = Lexicon()
    lexicon.add_words("CAR", "CAT", "CARD", "CART")
    lexicon2 = Lexicon()
    lexicon2.add_words("CAT")
    print(lexicon2)
    grid = [["A", "A", "R"],
            ["T", "C", "D"]]
    words = list(search_grid(grid, lexicon))
    # Reversing so it prints results out in the order shown in the pdf
    words.reverse()
    print(words)
def get_lexicon_from_raw_lexicon_then_write(basename, newname):
    filename = os.path.join(LEXICON_DIR, basename)
    newfilename = os.path.join(LEXICON_DIR, newname)
    lex = Lexicon()
    entries = []
    with open(filename) as f:
        for line in f:
            lexicon_tuple = parse_entry(line)
            name = lexicon_tuple[0]
            entity = normalize_entity(lexicon_tuple[1])
            if entity == '':
                continue
            entries.append((name, entity))
    lex.add_entries(entries, False)
    with open(newfilename, 'w') as f:
        for name, entity in lex.entries.items():
            #print('%s :- NP : %s' % (name, entity), file=f)
            pass
    return lex
def __init__(self, kind, name, actionName, fields, radius, cutoff, options):
    self.name = name
    self.kind = kind
    self.actionName = actionName
    self.fields = fields
    self.radius = int(radius)
    self.cutoff = int(cutoff)
    self.options = options
    if kind == "lex":
        if self.options != {}:
            sys.stderr.write('Lexicon features do not yet support options')
            sys.exit(-1)
        self.lexicon = Lexicon(actionName)
    elif kind in ("token", "sentence"):
        if actionName not in features.__dict__:
            sys.stderr.write("Unknown operator named " + actionName + "\n")
            sys.exit(-1)
        self.function = features.__dict__[actionName]
    else:
        assert False
def read_Tischendorf_WH_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()
    tischrd.applyANLEXLemmaDictionary(
        "../text/lemmas/nonstrongs_forms_tags_lemmas.txt")
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon(
        "tischlexicon_nonunique.txt", False)
    return tischrd
def __init__(self, name=None, aliases=(), args=()):
    """
    Create a new ``Context`` named ``name``, with ``aliases``.

    ``name`` is optional, and should be a string if given. It's used to
    tell Context objects apart, and for use in a Parser when determining
    what chunk of input might belong to a given Context.

    ``aliases`` is also optional and should be an iterable containing
    strings. Parsing will honor any aliases when trying to "find" a given
    context in its input.

    May give one or more ``args``, which is a quick alternative to calling
    ``for arg in args: self.add_arg(arg)`` after initialization.
    """
    self.args = Lexicon()
    self.flags = Lexicon()
    self.name = name
    self.aliases = aliases
    for arg in args:
        self.add_arg(arg)
def training():
    """Takes the parallel corpus and sends it to the lexicon."""
    filename = sys.argv[1]            # todo: look at "sys.argv" in the docs
    my_lexicon = Lexicon(filename)    # creates the Lexicon object "my_lexicon"
    my_lexicon.readfile()             # builds a large lexicon from the parallel file
    my_lexicon.minimize()             # picks the most frequent word as the value
    return my_lexicon
def build(self, commons_path, corpora_path):
    # Prepare lexical dictionaries.
    self.words = Lexicon(self.words_normalize_digits)
    self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

    # Initialize training corpus.
    corpora = Corpora(corpora_path, commons_path)

    # Collect word and affix lexicons.
    for document in corpora:
        for token in document.tokens:
            word = token.word
            self.words.add(word)
            for s in self.get_suffixes(word):
                assert type(s) is str
                self.suffix.add(s)
    print "Words:", self.words.size(), "items in lexicon, including OOV"
    print "Suffix:", self.suffix.size(), "items in lexicon"

    # Load the common store, but don't freeze it yet. We will add the action
    # table and cascade specification to it.
    self.commons_path = commons_path
    self.commons = sling.Store()
    self.commons.load(commons_path)
    schema = sling.DocumentSchema(self.commons)

    # Prepare action table and cascade.
    self._build_action_table(corpora)
    self.cascade = cascade.ShiftMarkCascade(self.actions)
    print self.cascade

    # Save cascade specification in commons.
    _ = self.cascade.as_frame(self.commons, delegate_cell_prefix="delegate")

    # Freeze the common store.
    self.commons.freeze()

    # Add feature specs.
    self._specify_features()
def __init__(self, cfg):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    self.out_fn = self.cfg.get("machine", "definitions_binary_out")
    ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.read_dep_map(dep_map_fn)
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.word2lemma = {}
class Collection(object):
    def __init__(self):
        self.tasks = Lexicon()
        self.default = None

    def add_task(self, name, task, aliases=(), default=False):
        """
        Adds callable object ``task`` to this collection under name ``name``.

        If ``aliases`` is given, will be used to set up additional aliases
        for this task.

        ``default`` may be set to ``True`` to set the task as this
        collection's default invocation.
        """
        self.tasks[name] = task
        for alias in aliases:
            self.tasks.alias(alias, to=name)
        if default:
            if self.default:
                msg = "'%s' cannot be the default because '%s' already is!"
                raise ValueError(msg % (name, self.default))
            self.default = name

    def get(self, name=None):
        """
        Returns task named ``name``. Honors aliases.

        If this collection has a default task, it is returned when ``name``
        is empty or ``None``. If empty input is given and no task has been
        selected as the default, ValueError will be raised.
        """
        if not name:
            if self.default:
                return self.get(self.default)
            else:
                raise ValueError("This collection has no default task.")
        return self.tasks[name]
class Parser(object):
    def __init__(self, contexts=(), initial=None):
        self.initial = initial
        self.contexts = Lexicon()
        for context in contexts:
            debug("Adding %s" % context)
            if not context.name:
                raise ValueError("Non-initial contexts must have names.")
            exists = "A context named/aliased %r is already in this parser!"
            if context.name in self.contexts:
                raise ValueError(exists % context.name)
            self.contexts[context.name] = context
            for alias in context.aliases:
                if alias in self.contexts:
                    raise ValueError(exists % alias)
                self.contexts.alias(alias, to=context.name)

    def parse_argv(self, argv):
        """
        Parse an argv-style token list ``argv``.

        Returns a list of ``Context`` objects matching the order they were
        found in the ``argv`` and containing ``Argument`` objects with
        updated values based on any flags given.

        Assumes any program name has already been stripped out. Good::

            Parser(...).parse_argv(['--core-opt', 'task', '--task-opt'])

        Bad::

            Parser(...).parse_argv(['invoke', '--core-opt', ...])
        """
        machine = ParseMachine(initial=self.initial, contexts=self.contexts)
        for token in argv:
            machine.handle(token)
        machine.finish()
        return machine.result
def __init__(self, contexts=(), initial=None):
    self.initial = initial
    self.contexts = Lexicon()
    for context in contexts:
        debug("Adding %s" % context)
        if not context.name:
            raise ValueError("Non-initial contexts must have names.")
        exists = "A context named/aliased %r is already in this parser!"
        if context.name in self.contexts:
            raise ValueError(exists % context.name)
        self.contexts[context.name] = context
        for alias in context.aliases:
            if alias in self.contexts:
                raise ValueError(exists % alias)
            self.contexts.alias(alias, to=context.name)
def __init__(self, cfg, direct_parse=False):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    if not direct_parse:
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.read_dep_map(dep_map_fn)
    self.word2lemma = {}
    self.first_only = cfg.getboolean('filter', 'first_only')
def __init__(self, contexts=(), initial=None, ignore_unknown=False):
    self.initial = initial
    self.contexts = Lexicon()
    self.ignore_unknown = ignore_unknown
    for context in contexts:
        debug("Adding {0}".format(context))
        if not context.name:
            raise ValueError("Non-initial contexts must have names.")
        exists = "A context named/aliased {0!r} is already in this parser!"
        if context.name in self.contexts:
            raise ValueError(exists.format(context.name))
        self.contexts[context.name] = context
        for alias in context.aliases:
            if alias in self.contexts:
                raise ValueError(exists.format(alias))
            self.contexts.alias(alias, to=context.name)
def __init__(self, cfg, direct_parse=False):
    self.cfg = cfg
    self.lang = self.cfg.get("deps", "lang")
    if not direct_parse:
        self.out_fn = self.cfg.get("machine", "definitions_binary_out")
        ensure_dir(os.path.dirname(self.out_fn))
    self.dependency_processor = DependencyProcessor(self.cfg)
    dep_map_fn = cfg.get("deps", "dep_map")
    self.undefined = set()
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.read_dep_map(dep_map_fn)
    self.word2lemma = {}
    self.first_n = cfg.getint('filter', 'first_n')
    self.graph_dir = self.cfg.get('machine', 'graph_dir')
    ensure_dir(self.graph_dir)
def __init__(self, cfg, cfg_section='word_sim'):
    self.batch = cfg.getboolean(cfg_section, 'batch')
    logging.warning("fourlangpath is {0}".format(
        cfg.get(cfg_section, 'fourlangpath')))
    self.cfg = cfg
    self.graph_dir = cfg.get(cfg_section, "graph_dir")
    ensure_dir(self.graph_dir)
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.defined_words = self.lexicon.get_words()
    self.word_sim_cache = {}
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words('english'))
    self.sim_feats = SimFeatures(cfg, cfg_section)
    self.expand = cfg.getboolean(cfg_section, "expand")
    logging.info("expand is {0}".format(self.expand))
def __init__(self, cfg, cfg_section="word_sim"):
    try:
        self.batch = cfg.getboolean(cfg_section, "batch")
    except NoSectionError:
        self.batch = False
    self.cfg = cfg
    self.graph_dir = cfg.get(cfg_section, "graph_dir")
    ensure_dir(self.graph_dir)
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.defined_words = self.lexicon.get_words()
    self.word_sim_cache = {}
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words("english"))
    self.expand = cfg.getboolean(cfg_section, "expand")
    logging.info("expand is {0}".format(self.expand))
def make_lexicon(self):
    lexicon = Lexicon()
    the = Lexical("ART", {"ROOT": "?the", "AGR": "?v"})
    the.set_variable("?the", ["the"])
    the.set_variable("?v", ["3s", "3p"])
    lexicon.add_word("the", [the])
    dog = Lexical("N", {"ROOT": "?dog1", "AGR": "?3s"})
    dog.set_variable("?3s", ["3s"])
    dog.set_variable("?dog1", ["DOG1"])
    lexicon.add_word("dog", [dog])
    return lexicon
def __init__(self, kind, name, actionName, fields, radius, cutoff, options):
    self.name = name
    self.kind = kind
    self.actionName = actionName
    self.fields = fields
    self.radius = int(radius)
    self.cutoff = int(cutoff)
    self.options = options
    if kind == "lex":
        if self.options != {}:
            sys.stderr.write("Lexicon features do not yet support options")
            sys.exit(-1)
        self.lexicon = Lexicon(actionName)
    elif kind in ("token", "sentence"):
        if actionName not in features.__dict__:
            sys.stderr.write("Unknown operator named " + actionName + "\n")
            sys.exit(-1)
        self.function = features.__dict__[actionName]
    else:
        assert False
def makeGold(world):
    english = {
        "baby": "baby", "bigbird": "bird", "bird": "bird", "books": "book",
        "bunnyrabbit": "bunny", "cows": "cow", "moocows": "cow", "duckie": "duck",
        "hand": "hand", "kitty": "kitty", "kittycats": "kitty", "lambie": "lamb",
        "pig": "pig", "piggies": "pig", "ring": "ring", "sheep": "sheep",
        "birdie": "duck", "bear": "bear", "bigbirds": "bird", "book": "book",
        "cow": "cow", "moocow": "cow", "duck": "duck", "eyes": "eyes",
        "hat": "hat", "kittycat": "kitty", "lamb": "lamb", "mirror": "mirror",
        "piggie": "pig", "rattle": "rattle", "rings": "ring", "bunnies": "bunny",
        "bird": "duck"
    }
    gold = Lexicon()
    for key in english:
        gold.words.append(world.words_key.index(key))
        gold.objects.append(world.objects_key.index(english[key]))
    return gold
def _load_lexicon(
    self,
    src_vocab: List[str],
    trg_vocab: List[str],
    align_model: str,
    threshold: float = 0.0,
    include_special_tokens: bool = False,
) -> Lexicon:
    lexicon = Lexicon()
    model_path = self.model_dir / f"src_trg_{align_model}.t{self.file_suffix}"
    for line in load_corpus(model_path):
        src_index_str, trg_index_str, prob_str = line.split(maxsplit=3)
        src_index = int(src_index_str)
        trg_index = int(trg_index_str)
        if include_special_tokens or (src_index > 1 and trg_index > 1):
            src_word = src_vocab[src_index]
            trg_word = trg_vocab[trg_index]
            prob = float(prob_str)
            if prob > threshold:
                lexicon[src_word, trg_word] = prob
    return lexicon
def __init__(self, cfg, cfg_section='word_sim'):
    self.batch = cfg.getboolean(cfg_section, 'batch')
    logging.warning("fourlangpath is {0}".format(
        cfg.get(cfg_section, 'fourlangpath')))
    self.cfg = cfg
    self.graph_dir = cfg.get(cfg_section, "graph_dir")
    ensure_dir(self.graph_dir)
    self.lemmatizer = Lemmatizer(cfg)
    self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
    self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
    self.defined_words = self.lexicon.get_words()
    self.word_sim_cache = {}
    self.lemma_sim_cache = {}
    self.links_nodes_cache = {}
    self.stopwords = set(nltk_stopwords.words('english'))
    self.sim_feats = SimFeatures(cfg, cfg_section, self.lexicon)
    self.expand = cfg.getboolean(cfg_section, "expand")
    compositional = cfg.getboolean('similarity', 'compositional')
    if compositional is True:
        self.text_to_4lang = TextTo4lang(cfg, direct_parse=True)
    logging.info("expand is {0}".format(self.expand))
    self.allow_4lang = cfg.getboolean('machine', 'allow_4lang')
def argspec(self):
    spec = inspect.getargspec(self.body)
    # Associate default values with their respective arg names
    if spec.defaults is not None:
        ret = Lexicon(zip(spec.args[-len(spec.defaults):], spec.defaults))
    else:
        ret = Lexicon()
    # Pull in args that have no default values
    ret.update((x, None) for x in spec.args if x not in ret)
    # Handle auto short flags
    if self.auto_shortflags:
        for name in ret:
            alias = None
            for char in name:
                if not (char == name or char in ret):
                    alias = char
                    break
            if alias:
                ret.alias(alias, to=name)
    return ret
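The auto-shortflag loop above can be exercised on its own with nothing but a Lexicon. A standalone sketch, assuming the ``lexicon`` package that provides the Lexicon class used throughout these examples (the argument names are invented for illustration):

from lexicon import Lexicon  # assumption: the PyPI `lexicon` package

ret = Lexicon()
ret.update((name, name.upper()) for name in ('clean', 'docs', 'verbose'))

# Same logic as the property above: each name gets the first character that
# is not already taken by a full argument name.
for name in list(ret):
    alias = None
    for char in name:
        if not (char == name or char in ret):
            alias = char
            break
    if alias:
        ret.alias(alias, to=name)

assert ret['c'] == 'CLEAN' and ret['d'] == 'DOCS' and ret['v'] == 'VERBOSE'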
    for tid in tid_list:
        token_count[tid] = token_count.get(tid, 0) + 1
    tid_list = token_count.keys()
    tid_list.sort()
    output = []
    prev_tid = 0
    for tid in tid_list:
        tid_set.add(tid)
        output.append(tid - prev_tid)
        output.append(token_count[tid])
        prev_tid = tid
    return output


lexicon = Lexicon()
#limit = 30000
limit = 330071
rand = Random()
(_, tmp_file_name) = tempfile.mkstemp()
with open(tmp_file_name, 'w') as tmp_file:
    with open('data/parsed_reviews.json') as reviews_file:
        progress = Progress('Create buckets', limit)
        buckets = [[] for i in xrange(2)]
        for (idx, line) in enumerate(reviews_file):
            progress.Update()
            if idx >= limit:
                break
            obj = json.loads(line)
def attributes_work(self):
    lex = Lexicon()
    lex.foo = 'bar'
    eq_(lex['foo'], lex.foo)
def aliased_real_attributes_do_not_override_real_attributes(self):
    lex = Lexicon()
    lex.alias('get', to='notget')
    lex.notget = 'value'
    assert callable(lex.get)
    assert lex.get != 'value'
def __init__(self):
    self.tasks = Lexicon()
    self.default = None
def aliases_appear_in_attributes(self):
    lex = Lexicon()
    lex.alias('foo', to='bar')
    lex.foo = 'value'
    assert lex.foo == lex.bar == lex['foo'] == lex['bar'] == 'value'
def aliases_work(self):
    lex = Lexicon()
    lex.alias('foo', to='bar')
    lex['bar'] = 'value'
    assert lex['foo'] == lex['bar'] == 'value'
def aliases_work(self):
    l = Lexicon()
    l.alias('foo', to='bar')
    l['bar'] = 'value'
    assert l['foo'] == l['bar'] == 'value'
def attributes_work(self):
    l = Lexicon()
    l.foo = 'bar'
    eq_(l['foo'], l.foo)
def aliases_appear_in_attributes(self):
    l = Lexicon()
    l.alias('foo', to='bar')
    l.foo = 'value'
    assert l.foo == l.bar == l['foo'] == l['bar'] == 'value'