Example #1
    def __init__(self, name=None, aliases=(), args=(), vararg=None):
        """
        Create a new ``ParserContext`` named ``name``, with ``aliases``.

        ``name`` is optional, and should be a string if given. It's used to
        tell ParserContext objects apart, and for use in a Parser when
        determining what chunk of input might belong to a given ParserContext.

        ``aliases`` is also optional and should be an iterable containing
        strings. Parsing will honor any aliases when trying to "find" a given
        context in its input.

        May give one or more ``args``, which is a quick alternative to calling
        ``for arg in args: self.add_arg(arg)`` after initialization.
        """
        self.args = Lexicon()
        self.positional_args = []
        self.flags = Lexicon()
        self.inverse_flags = {}  # No need for Lexicon here
        self.name = name
        self.aliases = aliases
        for arg in args:
            self.add_arg(arg)
        if vararg:
            self.varargs = []
        else:
            self.varargs = None
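
Both ``args`` and ``flags`` above are ``Lexicon`` instances (from the ``lexicon`` package), and their alias-aware dict behavior is what most of the examples on this page rely on. A minimal sketch of that behavior, assuming only the dict/attribute/alias semantics demonstrated by the tests at the end of this page:

from lexicon import Lexicon

lex = Lexicon()
lex.alias('v', to='verbose')           # register an alias first (as the tests below do)
lex.verbose = True                     # attribute-style assignment
assert lex['v'] is True                # item lookups honor the alias
assert lex['verbose'] == lex.verbose   # dict and attribute views agree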
Example #2
def __main__():
    trainingCorpus = ParsedConllFile(keepMalformed=False, projectivize=True)

    trainingCorpus.read(
        open(trainingFile, 'r', encoding='utf-8').read())

    # make fake model params, enough for lexicon builder
    # we still need feature_maps to use ParserState
    modelParams = ModelParameters('')
    modelParams.trainingFile = trainingFile
    modelParams.cfg = {'projectivizeTrainingSet': True}

    lexicon = Lexicon(modelParams)
    lexicon.compute()

    sentence = trainingCorpus.sentences[0]

    parser_state = ParserState(sentence, lexicon.getFeatureMaps())

    # necessary for initializing and pushing root
    # (only initialize transition_state_class once!)
    # keep arc_state in sync with parser_state
    arc_state = transition_state_class(parser_state)

    dynamicOracleTrainTest(parser_state)
Example #3
class Collection(object):
    def __init__(self):
        self.tasks = Lexicon()
        self.default = None

    def add_task(self, name, task, aliases=(), default=False):
        """
        Adds callable object ``task`` to this collection under name ``name``.

        If ``aliases`` is given, will be used to set up additional aliases for
        this task.

        ``default`` may be set to ``True`` to set the task as this collection's
        default invocation.
        """
        self.tasks[name] = task
        for alias in aliases:
            self.tasks.alias(alias, to=name)
        if default:
            if self.default:
                msg = "'%s' cannot be the default because '%s' already is!"
                raise ValueError(msg % (name, self.default))
            self.default = name

    def __getitem__(self, name=None):
        """
        Returns task named ``name``. Honors aliases.

        If this collection has a default task, it is returned when ``name`` is
        empty or ``None``. If empty input is given and no task has been
        selected as the default, ValueError will be raised.
        """
        if not name:
            if self.default:
                return self[self.default]
            else:
                raise ValueError("This collection has no default task.")
        return self.tasks[name]

    def to_contexts(self):
        """
        Returns all contained tasks and subtasks as a list of parser contexts.
        """
        result = []
        for name, task in self.tasks.iteritems():
            context = Context(name=name, aliases=task.aliases)
            argspec = task.argspec
            for name, default in argspec.iteritems():
                # Handle arg options
                opts = {}
                if default is not None:
                    opts['kind'] = type(default)
                # Handle aliases (auto shortflags, etc)
                names = [name]
                names.extend(argspec.aliases_of(name))
                # Create/add the argument
                context.add_arg(names=names, **opts)
            result.append(context)
        return result
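
A quick usage sketch for the collection above; the lambda task body is a hypothetical stand-in for a real callable:

coll = Collection()
coll.add_task('build', lambda: 'building', aliases=('b',), default=True)
assert coll['build']() == 'building'   # direct lookup
assert coll['b']() == 'building'       # alias resolved through the Lexicon
assert coll[None]() == 'building'      # empty name falls back to the default task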
Example #4
 def dir_only_shows_real_keys(self):
     "dir() only shows real keys-as-attrs, not aliases"
     a = Lexicon({'key1': 'val1', 'key2': 'val2'})
     a.alias('myalias', 'key1')
     assert 'key1' in dir(a)
     assert 'key2' in dir(a)
     assert 'myalias' not in dir(a)
Example #5
    def __init__(self, nombre=None, alias=(), args=()):
        """
        Create a new ``AnalizadorDeContexto`` named ``nombre``, with
        ``alias``.

        ``nombre`` is optional, and should be a string if given. It is used
        to tell AnalizadorDeContexto objects apart, and for use in an
        Analizador when determining what chunk of input might belong to a
        given AnalizadorDeContexto.

        ``alias`` is also optional and should be an iterable containing
        strings. Parsing will honor any aliases when trying to "find" a given
        context in its input.

        May give one or more ``args``, which is a quick alternative to
        calling ``for arg in args: self.agregar_arg(arg)`` after
        initialization.
        """
        self.args = Lexicon()
        self.args_posicionales = []
        self.banderas = Lexicon()
        self.banderas_inversas = {}  # No need for Lexicon here
        self.nombre = nombre
        self.alias = alias
        for arg in args:
            self.agregar_arg(arg)
Example #6
 def getLexicons(self, l_type='Senti'):
     lexicon = Lexicon()
     if l_type == 'Senti':
         lexicon.getLexiconsFromSentiment()
     elif l_type == 'PN':
         lexicon.getLexiconsFromPN()
     self.lexicons = lexicon.lexicons
Example #7
class InvertedIndex():

    def __init__(self):
        self.invertedindex = {}
        self.lexicon = Lexicon()
        self.tokenizer = Tokenizer()
        self.doc_reader = DocReader()
        self.build_index()

    def build_index(self):
        # Read the cached documents and index each one.
        cache = self.doc_reader.get_cache()
        docs = self.doc_reader.read_docs(cache)
        print "\nINVERTEDINDEX : Indexing %d documents..\n" % len(docs)
        for d in range(len(docs)):
            print "Indexing document '%s'" % (settings.PATH_DOCS + str(d))
            self.add_document(docs[d], d)

        print "Indexed total %d unique terms" % self.lexicon.size()

    def get_postinglist(self, lex_id):
        return self.invertedindex[lex_id]

    def add_document(self, doc, document_id):
        """FIXME: 
        -Needs doc 
        -Too slow?
        -Remove stop words
        -Reduce number of tokens
        """
        tokens = self.tokenizer.tokenize(doc)

        for t in tokens:
            lex_id = self.lexicon.lookup(t.get_value())

            if lex_id == settings.INVALID:
                # First occurrence of this term: start its posting list.
                lex_id = self.lexicon.add_value(t.get_value())
                pl = PostingList()
                pl.append_posting(Posting(document_id, t.get_position()))
                self.invertedindex[lex_id] = pl
                continue  # the position is already recorded above

            pl = self.get_postinglist(lex_id)
            if pl.get_last_posting().get_document_id() != document_id:
                # First occurrence in this document: append a new posting.
                pl.append_posting(Posting(document_id, t.get_position()))
            else:
                # Another occurrence in the same document: record the position.
                p = pl.get_last_posting()
                p.append_position(t.get_position())

    def size(self):
        return len(self.invertedindex)

    def debugprint(self):
        voc = self.lexicon.get_vocabulary()
        for v in voc:
            lid = self.lexicon.lookup(v)
            pl = self.get_postinglist(lid)
            print "[%s]" % v
            pl.info()
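
The per-token bookkeeping in ``add_document`` is easier to see without the Posting/PostingList indirection; here is the same logic restated as a self-contained sketch, with ``(doc_id, positions)`` pairs standing in for Posting objects:

def add_tokens(invertedindex, lexicon, tokens, document_id):
    """tokens: (value, position) pairs; lexicon: term -> integer id."""
    for value, position in tokens:
        lex_id = lexicon.setdefault(value, len(lexicon))  # add_value analogue
        pl = invertedindex.setdefault(lex_id, [])         # PostingList analogue
        if pl and pl[-1][0] == document_id:
            pl[-1][1].append(position)            # same doc: record another position
        else:
            pl.append((document_id, [position]))  # new doc: append a fresh posting

index, lex = {}, {}
add_tokens(index, lex, [('cat', 0), ('dog', 1), ('cat', 2)], document_id=1)
assert index[lex['cat']] == [(1, [0, 2])]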
Example #8
def create_lexicon():
	lexicon = Lexicon()
	print '=== CREATING LEXICON ==='
	print '=== READING WORDS FILE ==='
	with open(ENGLISH_WORDS, 'r') as words_file:
		for word in words_file:
			lexicon.add_word(word.strip())  # drop the trailing newline
	print '=== LEXICON CREATED ==='
	return lexicon
Example #9
def get_ccg_lexicon():
    lexicon = Lexicon()
    filename = os.path.join(LEXICON_DIR, 'geo-lexicon.txt')
    entries = []
    with open(filename) as f:
        for line in f:
            x, y = line.strip().split(' :- NP : ')
            entries.append((x, y))
    lexicon.add_entries(entries, False)
    return lexicon
Example #10
def get_ccg_lexicon():
  lexicon = Lexicon()
  filename = os.path.join(DB_DIR, 'lexicon.txt')
  entries = []
  with open(filename) as f:
    for line in f:
      x, y = line.strip().split(' :- NP : ')
      y = y.replace(':', ':_')
      entries.append((x, y))
  lexicon.add_entries(entries)
  return lexicon
Example #11
def get_ccg_lexicon():
    lexicon = Lexicon()
    filename = os.path.join(DB_DIR, 'lexicon.txt')
    entries = []
    with open(filename) as f:
        for line in f:
            x, y = line.strip().split(' :- NP : ')
            y = y.replace(':', ':_')
            entries.append((x, y))
    lexicon.add_entries(entries)
    return lexicon
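
For reference, the entry format these loaders expect, worked on a single made-up line (the entry itself is hypothetical):

line = 'what states border texas :- NP : texas:s\n'
x, y = line.strip().split(' :- NP : ')
y = y.replace(':', ':_')   # escape colons inside the logical form
assert (x, y) == ('what states border texas', 'texas:_s')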
Example #12
 def test_is_prefix(self):
     lexicon = Lexicon()
     lexicon.add_words("CAT")
     self.assertTrue(lexicon.is_prefix("CAT"))
     self.assertTrue(lexicon.is_prefix("CA"))
     self.assertFalse(lexicon.is_prefix("DOG"))
     self.assertFalse(lexicon.is_prefix(""))
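
A minimal sketch of a lexicon that would satisfy this test; this is an assumed implementation, not the project's actual one. Note the test's conventions: a whole word counts as its own prefix, while the empty string does not:

class PrefixLexicon(object):
    def __init__(self):
        self._prefixes = set()

    def add_words(self, *words):
        for word in words:
            # Register every non-empty prefix, including the word itself.
            for i in range(1, len(word) + 1):
                self._prefixes.add(word[:i])

    def is_prefix(self, s):
        return s in self._prefixes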
Example #13
 def train(self, train_trees):
     self.lexicon = Lexicon(train_trees)
     self.known_parses = {}
     self.span_to_categories = {}
     for train_tree in train_trees:
         tags = train_tree.get_preterminal_yield()
         tags = tuple(tags)  # because lists are not hashable, but tuples are
         if tags not in self.known_parses:
             self.known_parses[tags] = {}
         if train_tree not in self.known_parses[tags]:
             self.known_parses[tags][train_tree] = 1
         else:
             self.known_parses[tags][train_tree] += 1
         self.tally_spans(train_tree, 0)
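
The counting scheme above, restated on toy data (tag tuples and tree names are made up) to show the shape of ``known_parses``:

known_parses = {}
for tags, tree in [(('DT', 'NN'), 'tree_a'),
                   (('DT', 'NN'), 'tree_a'),
                   (('DT', 'NN'), 'tree_b')]:
    known_parses.setdefault(tags, {})
    known_parses[tags][tree] = known_parses[tags].get(tree, 0) + 1

assert known_parses == {('DT', 'NN'): {'tree_a': 2, 'tree_b': 1}}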
Example #14
	def __init__(self, code = "zxx"):
		"""
		Create a lect object.
		A I{lect} is a language variety; it can be a spoken or a written form, a colloquial, mediatic, or standard form, and so on.

		It wraps serialization and high-level features.

		It contains three independent internal members:
			- L{lexicon<lexicon>}
			- L{grammar<grammar>}
			- L{inflections<inflection>}

		@type code: str
		@param code:
			A language code according to U{ISO<http://www.iso.org>} standard.

			For the language codes, refer to 639-3 specifications.

			A country/variety code and a representation system might be added: C{eng-US}, C{esp:ERG}, C{por-BR:IPA}
		"""
		self.code = code
		self.name = u""
		self.english_name = ""
		self.__p_o_s = ()
		self.__lemma_categories = {}
		self.__categories = {}
		self.grammar = Grammar(code)
		self.lexicon = Lexicon()
		self.inflections = Inflections()
		self.properties = {"separator": " ", "capitalization": "3"}  # Lexical and Initials
Example #15
 def __init__(self, contexts=(), initial=None, ignore_unknown=False):
     self.initial = initial
     self.contexts = Lexicon()
     self.ignore_unknown = ignore_unknown
     for context in contexts:
         debug("Adding {}".format(context))
         if not context.name:
             raise ValueError("Non-initial contexts must have names.")
         exists = "A context named/aliased {!r} is already in this parser!"
         if context.name in self.contexts:
             raise ValueError(exists.format(context.name))
         self.contexts[context.name] = context
         for alias in context.aliases:
             if alias in self.contexts:
                 raise ValueError(exists.format(alias))
             self.contexts.alias(alias, to=context.name)
Example #16
 def ensure_deepcopy_works(self):
     l = Lexicon()
     l['foo'] = 'bar'
     eq_(l.foo, 'bar')
     l2 = copy.deepcopy(l)
     l2.foo = 'biz'
     assert l2.foo != l.foo
Example #17
 def ensure_deepcopy_works(self):
     lex = Lexicon()
     lex['foo'] = 'bar'
     eq_(lex.foo, 'bar')
     lex2 = copy.deepcopy(lex)
     lex2.foo = 'biz'
     assert lex2.foo != lex.foo
Example #18
 def extract_lexicon(self, out_file_path: Path, threshold: float = 0.0) -> None:
     src_vocab = self._load_vocab("src")
     trg_vocab = self._load_vocab("trg")
     direct_lexicon = self._load_lexicon(src_vocab, trg_vocab, "invswm", threshold=threshold)
     inverse_lexicon = self._load_lexicon(trg_vocab, src_vocab, "swm", threshold=threshold)
     lexicon = Lexicon.symmetrize(direct_lexicon, inverse_lexicon, threshold=threshold)
     lexicon.write(out_file_path)
Example #19
 def __init__(self, contextos=(), inicial=None, ignorar_desconocido=False):
     self.inicial = inicial
     self.contextos = Lexicon()
     self.ignorar_desconocido = ignorar_desconocido
     for contexto in contextos:
         debug("Añadiendo {}".format(contexto))
         if not contexto.nombre:
             raise ValueError(
                 "Los contextos no-iniciales deben tener nombres.")
         exists = "Un contexto llamado/alias {!r} ya esta en este analizador!"
         if contexto.nombre in self.contextos:
             raise ValueError(exists.format(contexto.nombre))
         self.contextos[contexto.nombre] = contexto
         for alias in contexto.alias:
             if alias in self.contextos:
                 raise ValueError(exists.format(alias))
             self.contextos.alias(alias, to=contexto.nombre)
Example #20
def read_Tischendorf_WH_Matthew_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf_MT()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH_MT()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon("tischlexicon_nonunique.txt", False)
    return tischrd
Example #21
def read_Tischendorf_WH_Matthew_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf_MT()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH_MT()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon("tischlexicon_nonunique.txt", False)
    return tischrd
Example #22
def process_xml_wiki(xmlf):
    process_pool = Pool(processes=PROCESS_AMOUNT)
    lexicon_db = Lexicon()
    with click.progressbar(
            process_pool.map(extract_mention_links_categories,
                             extract_page(xmlf)),
            label='obtaining mentions and links') as mention_link_progress_bar:
        for source_uri, lexicon, links, categories_list in mention_link_progress_bar:
            lexicon_db.insert_categories_uri(source_uri, categories_list)
            lexicon_db.insert_links_uri(source_uri, links)
            lexicon_db.insert_mentions_uris(lexicon)
Example #23
def main(args):
    lexicon = Lexicon()
    lexicon.add_words("CAR", "CAT", "CARD", "CART")
    lexicon2 = Lexicon()
    lexicon2.add_words("CAT")
    print(lexicon2)
    grid = [["A", "A", "R"], ["T", "C", "D"]]
    words = list(search_grid(grid, lexicon))
    # Reversing so it prints results out in the order shown in the pdf
    words.reverse()
    print(words)
Example #24
def get_lexicon_from_raw_lexicon_then_write(basename, newname):
    filename = os.path.join(LEXICON_DIR, basename)
    newfilename = os.path.join(LEXICON_DIR, newname)
    lex = Lexicon()
    entries = []
    with open(filename) as f:
        for line in f:
            lexicon_tuple = parse_entry(line)
            name = lexicon_tuple[0]
            entity = normalize_entity(lexicon_tuple[1])
            if entity == '':
                continue
            entries.append((name, entity))
    lex.add_entries(entries, False)
    with open(newfilename, 'w') as f:
        for name, entity in lex.entries.items():
            print('%s :- NP : %s' % (name, entity), file=f)
    return lex
Example #25
 def __init__(self, kind, name, actionName, fields, radius, cutoff,
              options):
     self.name = name
     self.kind = kind
     self.actionName = actionName
     self.fields = fields
     self.radius = int(radius)
     self.cutoff = int(cutoff)
     self.options = options
     if kind == "lex":
         if self.options != {}:
             sys.stderr.write('Lexicon features do not yet support options\n')
             sys.exit(-1)
         self.lexicon = Lexicon(actionName)
     elif kind in ("token", "sentence"):
         if actionName not in features.__dict__:
             sys.stderr.write("Unknown operator named " + actionName + "\n")
             sys.exit(-1)
         self.function = features.__dict__[actionName]
     else:
         assert False
Example #26
def read_Tischendorf_WH_compare_them():
    lexicon = Lexicon()
    tischrd = read_AccentedTischendorf()
    ma = ManualAnalyses("./manual_analyses.txt")
    #whrd = read_WH_writeMQL()
    whrd = read_WH()
    #trstephrd = read_Stephanus()
    #byzrd = read_Byzantine()
    #lexicon = byzrd.produceLexicon(lexicon)
    #lexicon = trstephrd.produceLexicon(lexicon)
    whrd.compareTischendorf(tischrd, lexicon, ma)
    tischrd.applyMappings()
    tischrd.applyANLEXLemmaDictionary(
        "../text/lemmas/nonstrongs_forms_tags_lemmas.txt")
    tischrd.writeBooks_MORPH_style(tisch_out_basedir, "TSP", kind.kBETA)
    lexicon = whrd.lexicon
    lexicon.writeLexicon("lexicon_nonunique.txt", False)
    tischlexicon = Lexicon()
    tischrd.produceLexicon(tischlexicon).writeLexicon(
        "tischlexicon_nonunique.txt", False)
    return tischrd
Example #27
    def __init__(self, name=None, aliases=(), args=()):
        """
        Create a new ``Context`` named ``name``, with ``aliases``.

        ``name`` is optional, and should be a string if given. It's used to
        tell Context objects apart, and for use in a Parser when determining
        what chunk of input might belong to a given Context.

        ``aliases`` is also optional and should be an iterable containing
        strings. Parsing will honor any aliases when trying to "find" a given
        context in its input.

        May give one or more ``args``, which is a quick alternative to calling
        ``for arg in args: self.add_arg(arg)`` after initialization.
        """
        self.args = Lexicon()
        self.flags = Lexicon()
        self.name = name
        self.aliases = aliases
        for arg in args:
            self.add_arg(arg)
Example #28
def training():
    """takes the parallel corpus and sends it to lexicon"""
    filename = sys.argv[1]  # todo: look at "sys.argv" in the doc
    my_lexicon = Lexicon(filename)  # creates the Lexicon object "my_lexicon"
    my_lexicon.readfile()  # creates large lexicon from parallel file
    my_lexicon.minimize()  # picks most frequent word as value
    return my_lexicon
Example #29
File: spec.py Project: savkov/sling
    def build(self, commons_path, corpora_path):
        # Prepare lexical dictionaries.
        self.words = Lexicon(self.words_normalize_digits)
        self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

        # Initialize training corpus.
        corpora = Corpora(corpora_path, commons_path)

        # Collect word and affix lexicons.
        for document in corpora:
            for token in document.tokens:
                word = token.word
                self.words.add(word)
                for s in self.get_suffixes(word):
                    assert type(s) is str
                    self.suffix.add(s)
        print "Words:", self.words.size(), "items in lexicon, including OOV"
        print "Suffix:", self.suffix.size(), "items in lexicon"

        # Load the commons store, but don't freeze it yet; we will add the
        # action table and cascade specification to it first.
        self.commons_path = commons_path
        self.commons = sling.Store()
        self.commons.load(commons_path)
        schema = sling.DocumentSchema(self.commons)

        # Prepare action table and cascade.
        self._build_action_table(corpora)
        self.cascade = cascade.ShiftMarkCascade(self.actions)
        print self.cascade

        # Save cascade specification in commons.
        _ = self.cascade.as_frame(self.commons,
                                  delegate_cell_prefix="delegate")

        # Freeze the common store.
        self.commons.freeze()

        # Add feature specs.
        self._specify_features()
Example #30
 def __init__(self, cfg):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     self.out_fn = self.cfg.get("machine", "definitions_binary_out")
     ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.read_dep_map(dep_map_fn)
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.word2lemma = {}
Example #31
 def __init__(self, cfg):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     self.out_fn = self.cfg.get("machine", "definitions_binary_out")
     ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.read_dep_map(dep_map_fn)
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.word2lemma = {}
Example #32
class Collection(object):
    def __init__(self):
        self.tasks = Lexicon()
        self.default = None

    def add_task(self, name, task, aliases=(), default=False):
        """
        Adds callable object ``task`` to this collection under name ``name``.

        If ``aliases`` is given, will be used to set up additional aliases for
        this task.

        ``default`` may be set to ``True`` to set the task as this collection's
        default invocation.
        """
        self.tasks[name] = task
        for alias in aliases:
            self.tasks.alias(alias, to=name)
        if default:
            if self.default:
                msg = "'%s' cannot be the default because '%s' already is!"
                raise ValueError(msg % (name, self.default))
            self.default = name

    def get(self, name=None):
        """
        Returns task named ``name``. Honors aliases.

        If this collection has a default task, it is returned when ``name`` is
        empty or ``None``. If empty input is given and no task has been
        selected as the default, ValueError will be raised.
        """
        if not name:
            if self.default:
                return self.get(self.default)
            else:
                raise ValueError("This collection has no default task.")
        return self.tasks[name]
Example #33
class Parser(object):
    def __init__(self, contexts=(), initial=None):
        self.initial = initial
        self.contexts = Lexicon()
        for context in contexts:
            debug("Adding %s" % context)
            if not context.name:
                raise ValueError("Non-initial contexts must have names.")
            exists = "A context named/aliased %r is already in this parser!"
            if context.name in self.contexts:
                raise ValueError(exists % context.name)
            self.contexts[context.name] = context
            for alias in context.aliases:
                if alias in self.contexts:
                    raise ValueError(exists % alias)
                self.contexts.alias(alias, to=context.name)

    def parse_argv(self, argv):
        """
        Parse an argv-style token list ``argv``.

        Returns a list of ``Context`` objects matching the order they were
        found in the ``argv`` and containing ``Argument`` objects with updated
        values based on any flags given.

        Assumes any program name has already been stripped out. Good::

            Parser(...).parse_argv(['--core-opt', 'task', '--task-opt'])

        Bad::

            Parser(...).parse_argv(['invoke', '--core-opt', ...])
        """
        machine = ParseMachine(initial=self.initial, contexts=self.contexts)
        for token in argv:
            machine.handle(token)
        machine.finish()
        return machine.result
Example #34
 def __init__(self, contexts=(), initial=None):
     self.initial = initial
     self.contexts = Lexicon()
     for context in contexts:
         debug("Adding %s" % context)
         if not context.name:
             raise ValueError("Non-initial contexts must have names.")
         exists = "A context named/aliased %r is already in this parser!"
         if context.name in self.contexts:
             raise ValueError(exists % context.name)
         self.contexts[context.name] = context
         for alias in context.aliases:
             if alias in self.contexts:
                 raise ValueError(exists % alias)
             self.contexts.alias(alias, to=context.name)
Example #35
 def __init__(self, cfg, direct_parse=False):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     if not direct_parse:
         self.out_fn = self.cfg.get("machine", "definitions_binary_out")
         ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.read_dep_map(dep_map_fn)
     self.word2lemma = {}
     self.first_only = cfg.getboolean('filter', 'first_only')
Example #36
 def __init__(self, contexts=(), initial=None, ignore_unknown=False):
     self.initial = initial
     self.contexts = Lexicon()
     self.ignore_unknown = ignore_unknown
     for context in contexts:
         debug("Adding {0}".format(context))
         if not context.name:
             raise ValueError("Non-initial contexts must have names.")
         exists = "A context named/aliased {0!r} is already in this parser!"
         if context.name in self.contexts:
             raise ValueError(exists.format(context.name))
         self.contexts[context.name] = context
         for alias in context.aliases:
             if alias in self.contexts:
                 raise ValueError(exists.format(alias))
             self.contexts.alias(alias, to=context.name)
Example #37
 def __init__(self, cfg, direct_parse=False):
     self.cfg = cfg
     self.lang = self.cfg.get("deps", "lang")
     if not direct_parse:
         self.out_fn = self.cfg.get("machine", "definitions_binary_out")
         ensure_dir(os.path.dirname(self.out_fn))
     self.dependency_processor = DependencyProcessor(self.cfg)
     dep_map_fn = cfg.get("deps", "dep_map")
     self.undefined = set()
     self.lemmatizer = Lemmatizer(cfg)
     self.lexicon_fn = self.cfg.get("machine", "definitions_binary")
     self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
     self.read_dep_map(dep_map_fn)
     self.word2lemma = {}
     self.first_n = cfg.getint('filter', 'first_n')
     self.graph_dir = self.cfg.get('machine', 'graph_dir')
     ensure_dir(self.graph_dir)
Example #38
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')

        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section)
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))
Example #39
    def __init__(self, cfg, cfg_section="word_sim"):
        try:
            self.batch = cfg.getboolean(cfg_section, "batch")
        except NoSectionError:
            self.batch = False

        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words("english"))
        self.expand = cfg.getboolean(cfg_section, "expand")
        logging.info("expand is {0}".format(self.expand))
Example #40
    def make_lexicon(self):
        lexicon = Lexicon()

        the = Lexical("ART", {"ROOT": "?the", "AGR": "?v"})
        the.set_variable("?the", ["the"])
        the.set_variable("?v", ["3s", "3p"])
        lexicon.add_word("the", [the])

        dog = Lexical("N", {"ROOT": "?dog1", "AGR": "?3s"})
        dog.set_variable("?3s", ["3s"])
        dog.set_variable("?dog1", ["DOG1"])
        lexicon.add_word("dog", [dog])

        return lexicon
Example #41
 def __init__(self, kind, name, actionName, fields, radius, cutoff, options):
     self.name = name
     self.kind = kind
     self.actionName = actionName
     self.fields = fields
     self.radius = int(radius)
     self.cutoff = int(cutoff)
     self.options = options
     if kind == "lex":
         if self.options != {}:
             sys.stderr.write("Lexicon features do not yet support options")
             sys.exit(-1)
         self.lexicon = Lexicon(actionName)
     elif kind in ("token", "sentence"):
         if actionName not in features.__dict__:
             sys.stderr.write("Unknown operator named " + actionName + "\n")
             sys.exit(-1)
         self.function = features.__dict__[actionName]
     else:
         assert False
Example #42
def makeGold(world):
    english = {
        "baby": "baby",
        "bigbird": "bird",
        "bird": "bird",
        "books": "book",
        "bunnyrabbit": "bunny",
        "cows": "cow",
        "moocows": "cow",
        "duckie": "duck",
        "hand": "hand",
        "kitty": "kitty",
        "kittycats": "kitty",
        "lambie": "lamb",
        "pig": "pig",
        "piggies": "pig",
        "ring": "ring",
        "sheep": "sheep",
        "birdie": "duck",
        "bear": "bear",
        "bigbirds": "bird",
        "book": "book",
        "cow": "cow",
        "moocow": "cow",
        "duck": "duck",
        "eyes": "eyes",
        "hat": "hat",
        "kittycat": "kitty",
        "lamb": "lamb",
        "mirror": "mirror",
        "piggie": "pig",
        "rattle": "rattle",
        "rings": "ring",
        "bunnies": "bunny",
        "bird": "duck"
    }
    gold = Lexicon()
    for key in english:
        gold.words.append(world.words_key.index(key))
        gold.objects.append(world.objects_key.index(english[key]))
    return gold
Example #43
 def _load_lexicon(
     self,
     src_vocab: List[str],
     trg_vocab: List[str],
     align_model: str,
     threshold: float = 0.0,
     include_special_tokens: bool = False,
 ) -> Lexicon:
     lexicon = Lexicon()
     model_path = self.model_dir / f"src_trg_{align_model}.t{self.file_suffix}"
     for line in load_corpus(model_path):
         src_index_str, trg_index_str, prob_str = line.split(maxsplit=3)
         src_index = int(src_index_str)
         trg_index = int(trg_index_str)
         if include_special_tokens or (src_index > 1 and trg_index > 1):
             src_word = src_vocab[src_index]
             trg_word = trg_vocab[trg_index]
             prob = float(prob_str)
             if prob > threshold:
                 lexicon[src_word, trg_word] = prob
     return lexicon
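
The alignment-model file read above is assumed to hold whitespace-separated ``src_index trg_index prob`` lines, with indices 0 and 1 apparently reserved for special tokens (inferred from the ``> 1`` check). A made-up line, parsed the same way:

line = '5 12 0.3751'   # hypothetical lexicon line
src_index_str, trg_index_str, prob_str = line.split(maxsplit=3)
assert (int(src_index_str), int(trg_index_str), float(prob_str)) == (5, 12, 0.3751)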
Example #44
    def __init__(self, cfg, cfg_section='word_sim'):
        self.batch = cfg.getboolean(cfg_section, 'batch')

        logging.warning("fourlangpath is {0}".format(
            cfg.get(cfg_section, 'fourlangpath')))
        self.cfg = cfg
        self.graph_dir = cfg.get(cfg_section, "graph_dir")
        ensure_dir(self.graph_dir)
        self.lemmatizer = Lemmatizer(cfg)
        self.lexicon_fn = self.cfg.get(cfg_section, "definitions_binary")
        self.lexicon = Lexicon.load_from_binary(self.lexicon_fn)
        self.defined_words = self.lexicon.get_words()
        self.word_sim_cache = {}
        self.lemma_sim_cache = {}
        self.links_nodes_cache = {}
        self.stopwords = set(nltk_stopwords.words('english'))
        self.sim_feats = SimFeatures(cfg, cfg_section, self.lexicon)
        self.expand = cfg.getboolean(cfg_section, "expand")
        compositional = cfg.getboolean('similarity', 'compositional')
        if compositional:
            self.text_to_4lang = TextTo4lang(cfg, direct_parse=True)
        logging.info("expand is {0}".format(self.expand))
        self.allow_4lang = cfg.getboolean('machine', 'allow_4lang')
Example #45
 def argspec(self):
     spec = inspect.getargspec(self.body)
     # Associate default values with their respective arg names
     if spec.defaults is not None:
         ret = Lexicon(zip(spec.args[-len(spec.defaults):], spec.defaults))
     else:
         ret = Lexicon()
     # Pull in args that have no default values
     ret.update((x, None) for x in spec.args if x not in ret)
     # Handle auto short flags
     if self.auto_shortflags:
         for name in ret:
             alias = None
             for char in name:
                 if not (char == name or char in ret):
                     alias = char
                     break
             if alias:
                 ret.alias(alias, to=name)
     return ret
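
To see what the auto-shortflag loop yields, here is a self-contained re-creation with a plain dict in place of the Lexicon and the aliases collected separately (the function and its argument names are hypothetical; ``getfullargspec`` is the modern stand-in for ``getargspec``):

import inspect

def body(foo, bar=7, baz='x'):   # hypothetical task body
    pass

spec = inspect.getfullargspec(body)
ret = dict(zip(spec.args[-len(spec.defaults):], spec.defaults)) if spec.defaults else {}
ret.update((x, None) for x in spec.args if x not in ret)

aliases = {}
for name in ret:                 # insertion order: bar, baz, foo
    for char in name:
        # Take the first character not already used as an arg name or alias.
        if not (char == name or char in ret or char in aliases):
            aliases[char] = name
            break

assert ret == {'foo': None, 'bar': 7, 'baz': 'x'}
assert aliases == {'b': 'bar', 'a': 'baz', 'f': 'foo'}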
Example #46
  for tid in tid_list:
    token_count[tid] = token_count.get(tid, 0) + 1

  tid_list = token_count.keys()
  tid_list.sort()

  output = []
  prev_tid = 0
  for tid in tid_list:
    tid_set.add(tid)
    output.append(tid - prev_tid)
    output.append(token_count[tid])
    prev_tid = tid
  return output

lexicon = Lexicon()

#limit = 30000
limit = 330071

rand = Random()
(_, tmp_file_name) = tempfile.mkstemp()
with open(tmp_file_name, 'w') as tmp_file:
  with open('data/parsed_reviews.json') as reviews_file:
    progress = Progress('Create buckets', limit)
    buckets = [[] for i in xrange(2)]
    for (idx, line) in enumerate(reviews_file):
      progress.Update()
      if idx >= limit:
        break
      obj = json.loads(line)
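
A self-contained restatement of the delta encoding at the top of this example, on made-up token ids, to show the output layout (gap from the previous id, then that id's count):

def delta_encode(tid_list):
    token_count = {}
    for tid in tid_list:
        token_count[tid] = token_count.get(tid, 0) + 1
    output = []
    prev_tid = 0
    for tid in sorted(token_count):
        output.append(tid - prev_tid)    # gap to the previous token id
        output.append(token_count[tid])  # occurrences of this token id
        prev_tid = tid
    return output

assert delta_encode([7, 3, 7, 10]) == [3, 1, 4, 2, 3, 1]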
Example #47
 def attributes_work(self):
     lex = Lexicon()
     lex.foo = 'bar'
     eq_(lex['foo'], lex.foo)
Example #48
 def aliased_real_attributes_do_not_override_real_attributes(self):
     lex = Lexicon()
     lex.alias('get', to='notget')
     lex.notget = 'value'
     assert callable(lex.get)
     assert lex.get != 'value'
Example #49
 def __init__(self):
     self.tasks = Lexicon()
     self.default = None
Example #50
 def aliases_appear_in_attributes(self):
     lex = Lexicon()
     lex.alias('foo', to='bar')
     lex.foo = 'value'
     assert lex.foo == lex.bar == lex['foo'] == lex['bar'] == 'value'
Example #51
 def aliases_work(self):
     lex = Lexicon()
     lex.alias('foo', to='bar')
     lex['bar'] = 'value'
     assert lex['foo'] == lex['bar'] == 'value'
Example #52
 def aliases_work(self):
     l = Lexicon()
     l.alias('foo', to='bar')
     l['bar'] = 'value'
     assert l['foo'] == l['bar'] == 'value'
Example #53
 def attributes_work(self):
     l = Lexicon()
     l.foo = 'bar'
     eq_(l['foo'], l.foo)
Example #54
 def aliases_appear_in_attributes(self):
     l = Lexicon()
     l.alias('foo', to='bar')
     l.foo = 'value'
     assert l.foo == l.bar == l['foo'] == l['bar'] == 'value'