Example #1
from nltk import grammar, parse
import kr2terminals  # project-local module: maps KR morphological codes to terminal rules

def parseSentence(toks, grammarWoTerm, cfg):
    """Parse a tokenized sentence with either a plain CFG or a feature
    CFG whose terminal rules are generated from the tokens' KR codes."""
    if cfg:
        gr = grammar.parse_cfg(grammarWoTerm)
        parser = parse.BottomUpChartParser(gr)
    else:
        # Build one terminal rule per token, then append the rules to
        # the non-terminal part of the grammar.
        termRules = [kr2terminals.getRuleFromKr(kr) for kr in toks]
        fullGrammar = '\n'.join(grammarWoTerm + termRules)
        gr = grammar.parse_fcfg(fullGrammar)
        parser = parse.FeatureBottomUpChartParser(gr)
    chart = parser.chart_parse(toks)
    return chart
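A minimal usage sketch for the plain-CFG path (cfg=True), assuming NLTK 2.x; the rules and tokens below are invented for illustration:

# Hypothetical rules and tokens, just to exercise the cfg=True branch.
rules = ["S -> NP VP",
         "NP -> 'John'",
         "VP -> 'sleeps'"]
chart = parseSentence(['John', 'sleeps'], rules, cfg=True)
print chart.num_edges()  # number of edges built during parsing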
Example #2
def main(grammarFile='', text='', serialize=None, uncrossing=False,
         countCrosses=False, max_maxNpTag=False, b_baseNpTag=False,
         findWrongChunks=False, useCfg=False):
    sys.stderr.write('loading nltk...')
    corp = splitCorp(text)
    cLength = len(corp)
    # Read the hand-written rules, then append automatically generated
    # terminal rules for every token in the corpus.
    grammarText = [l.strip('\n') for l in open(grammarFile)]
    grammarText += getTerminals(corp)
    grammarObj = grammar.parse_fcfg(grammarText)
    parser = parse.FeatureBottomUpChartParser(grammarObj)
    sys.stderr.write('done!\nparsing...')
    
    if serialize:
        parseOut = open(serialize, 'w')
    
    allCrosses = 0.0
    allSens = 0.0
    for c, sen in enumerate(corp):
        allSens += 1
        # Report progress only when the integer percentage ticks over.
        if c * 100 / cLength > (c - 1) * 100 / cLength:
            sys.stderr.write(str(c * 100 / cLength) + '% ')
        # Each corpus line is assumed to hold '<word> <tag>'.
        toks = [l.strip('\n').split()[1] for l in sen]
        words = [l.strip('\n').split()[0] for l in sen]
        chart = parser.chart_parse(toks)
        if max_maxNpTag:
            tagMaxNPs(chart, sen, useCfg)
            print
        if b_baseNpTag:
            tagBaseNPs(chart, sen)
            print
        if findWrongChunks:
            getWrongChunks(chart, sen)
        if serialize:
            serializeParse(chart, toks, words, parseOut)
        if uncrossing:
            findUncrossingNPs(chart, words, useCfg)
        if countCrosses:
            allCrosses += findUncrossingNPs(chart, words, useCfg, countCrosses)
    sys.stderr.write('done\n')
    if countCrosses:
        print 'No. of sentences:', allSens
        print 'No. of crosses:', allCrosses
        print 'Average no. of crosses per sentence:', allCrosses / allSens
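A hypothetical invocation; the file names below are invented, and the corpus is expected in the one-token-per-line '<word> <tag>' format assumed above:

# 'np.fcfg' and 'corpus.txt' are made-up names for illustration.
main(grammarFile='np.fcfg',
     text=open('corpus.txt').read(),
     countCrosses=True)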
Example #3
File: util.py Project: gijs/nltk
def demo_legacy_grammar():
    """
    Check that batch_interpret() is compatible with legacy grammars that use
    a lowercase 'sem' feature.
    
    The legacy grammar is defined inline below.
    
    """
    from nltk.grammar import parse_fcfg

    g = parse_fcfg("""
    % start S
    S[sem=<hello>] -> 'hello'
    """)
    print "Reading grammar: %s" % g
    print "*" * 20
    for reading in batch_interpret(['hello'], g, semkey='sem'):
        syn, sem = reading[0]
        print
        print "output: ", sem
Example #4
def demo_grammar():
    from nltk.grammar import parse_fcfg
    return parse_fcfg("""
S  -> NP VP
PP -> Prep NP
NP -> NP PP
VP -> VP PP
VP -> Verb NP
VP -> Verb
NP -> Det[pl=?x] Noun[pl=?x]
NP -> "John"
NP -> "I"
Det -> "the"
Det -> "my"
Det[-pl] -> "a"
Noun[-pl] -> "dog"
Noun[-pl] -> "cookie"
Verb -> "ate"
Verb -> "saw"
Prep -> "with"
Prep -> "under"
""")
Example #5
def load(resource_url,
         format='auto',
         cache=True,
         verbose=False,
         logic_parser=None,
         fstruct_parser=None):
    """
    Load a given resource from the NLTK data package.  The following
    resource formats are currently supported:
      - C{'pickle'}
      - C{'yaml'}
      - C{'cfg'} (context free grammars)
      - C{'pcfg'} (probabilistic CFGs)
      - C{'fcfg'} (feature-based CFGs)
      - C{'fol'} (formulas of First Order Logic)
      - C{'logic'} (Logical formulas to be parsed by the given logic_parser)
      - C{'val'} (valuation of First Order Logic model)
      - C{'raw'}

    If no format is specified, C{load()} will attempt to determine a
    format based on the resource name's file extension.  If that
    fails, C{load()} will raise a C{ValueError} exception.

    @type resource_url: C{str}
    @param resource_url: A URL specifying where the resource should be
        loaded from.  The default protocol is C{"nltk:"}, which searches
        for the file in the NLTK data package.
    @type cache: C{bool}
    @param cache: If true, add this resource to a cache.  If C{load}
        finds a resource in its cache, then it will return it from the
        cache rather than loading it.  The cache uses weak references,
        so a resource will automatically be expunged from the cache
        when no more objects are using it.
        
    @type verbose: C{bool}
    @param verbose: If true, print a message when loading a resource.
        Messages are not displayed when a resource is retrieved from
        the cache.
    
    @type logic_parser: C{LogicParser}
    @param logic_parser: The parser that will be used to parse logical
        expressions.
    @type fstruct_parser: C{FeatStructParser}
    @param fstruct_parser: The parser that will be used to parse the
        feature structure of an fcfg.
    """
    # If we've cached the resource, then just return it.
    if cache:
        resource_val = _resource_cache.get(resource_url)
        if resource_val is not None:
            if verbose:
                print '<<Using cached copy of %s>>' % (resource_url, )
            return resource_val

    # Let the user know what's going on.
    if verbose:
        print '<<Loading %s>>' % (resource_url, )

    # Determine the format of the resource.
    if format == 'auto':
        resource_url_parts = resource_url.split('.')
        ext = resource_url_parts[-1]
        if ext == 'gz':
            ext = resource_url_parts[-2]
        format = AUTO_FORMATS.get(ext)
        if format is None:
            raise ValueError('Could not determine format for %s based '
                             'on its file\nextension; use the "format" '
                             'argument to specify the format explicitly.' %
                             resource_url)

    # Load the resource.
    if format == 'pickle':
        resource_val = pickle.load(_open(resource_url))
    elif format == 'yaml':
        resource_val = yaml.load(_open(resource_url))
    elif format == 'cfg':
        resource_val = cfg.parse_cfg(_open(resource_url).read())
    elif format == 'pcfg':
        resource_val = cfg.parse_pcfg(_open(resource_url).read())
    elif format == 'fcfg':
        resource_val = cfg.parse_fcfg(_open(resource_url).read(),
                                      logic_parser=logic_parser,
                                      fstruct_parser=fstruct_parser)
    elif format == 'fol':
        resource_val = sem.parse_logic(_open(resource_url).read(),
                                       logic_parser=sem.logic.LogicParser())
    elif format == 'logic':
        resource_val = sem.parse_logic(_open(resource_url).read(),
                                       logic_parser=logic_parser)
    elif format == 'val':
        resource_val = sem.parse_valuation(_open(resource_url).read())
    elif format == 'raw':
        resource_val = _open(resource_url).read()
    else:
        assert format not in FORMATS
        raise ValueError('Unknown format type!')

    # If requested, add it to the cache.
    if cache:
        try:
            _resource_cache[resource_url] = resource_val
        except TypeError:
            # We can't create weak references to some object types, like
            # strings and tuples.  For now, just don't cache them.
            pass

    return resource_val
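For example, loading a feature grammar by resource path; the path below points at a grammar shipped with the NLTK book's data package, so adjust it to whatever resources your installation has:

# 'grammars/book_grammars/feat0.fcfg' ships with the NLTK book data;
# any locally available resource path works the same way.
g = load('grammars/book_grammars/feat0.fcfg', verbose=True)
print g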
Example #6
from nltk.grammar import parse_fcfg
from nltk.parse import FeatureChartParser

def load_grammar(fn):
    # Read the grammar file as a string and attach a ready-made parser
    # to the grammar object for convenience.
    g = parse_fcfg(open(fn).read())
    g.parser = FeatureChartParser(g)
    g.parse = g.parser.parse
    return g
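A possible call site; 'toy.fcfg' is a made-up file name, and parse() here is NLTK 2.x's ChartParser.parse, which returns a single tree (or None if the sentence is not covered):

g = load_grammar('toy.fcfg')  # hypothetical grammar file
tree = g.parse('John sleeps'.split())
print tree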