def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.
    """
    from nltk.corpus import ieer
    if sql:
        try:
            import sqlite3
            connection = sqlite3.connect(":memory:")
            connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute("""create table Locations
            (OrgName text, LocationName text, DocID text)""")
        except ImportError:
            import warnings
            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print
    print "IEER: in(ORG, LOC) -- just the clauses:"
    print "=" * 45

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print doc.docno
                print "=" * 15
            for rel in extract_rels('ORG',
                                    'LOC',
                                    doc,
                                    corpus='ieer',
                                    pattern=IN):
                print show_clause(rel, relsym='IN')
                if sql:
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute(
                            """insert into Locations 
                                    values (?, ?, ?)""", rtuple)
                        connection.commit()
                    except NameError:
                        pass

    if sql:
        try:
            cur.execute("""select OrgName from Locations
                        where LocationName = 'Atlanta'""")
            print
            print "Extract data from SQL table: ORGs in Atlanta"
            print "-" * 15
            for row in cur:
                print row
        except NameError:
            pass
Exemple #2
0
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus in an RDF Graph.
    """
    graph = ConjunctiveGraph()
    graph.bind("nltk", BASE)
    graph.bind("org", "http://nltk.org/terms/org#")
    graph.bind("loc", "http://nltk.org/terms/loc#")
    graph.bind("pred", "http://nltk.org/terms/pred#")
    graph.bind("class", "http://nltk.org/terms/class#")
    in_uri = sym2uri(ns, "pred", "in")
    loc_uri = sym2uri(ns, "class", "Location")
    org_uri = sym2uri(ns, "class", "Organization")
    graph.add((in_uri, RDFNS.type, RDFSNS.Property))
    graph.add((loc_uri, RDFNS.type, RDFSNS.Class))
    graph.add((org_uri, RDFNS.type, RDFSNS.Class))
    graph.add((in_uri, RDFSNS.domain, org_uri))
    graph.add((in_uri, RDFSNS.range, loc_uri))
    from nltk.corpus import ieer

    IN = re.compile(r".*\bin\b(?!\b.+ing\b)")
    for item in ieer.fileids():
        for doc in ieer.parsed_docs(item):
            for reldict in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym="in"))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
Exemple #3
0
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus in an RDF Graph.
    """
    graph = ConjunctiveGraph()
    graph.bind('nltk',BASE)
    graph.bind('org', "http://nltk.org/terms/org#")
    graph.bind('loc', "http://nltk.org/terms/loc#")
    graph.bind('pred', "http://nltk.org/terms/pred#")
    graph.bind('class', "http://nltk.org/terms/class#")
    in_uri = sym2uri(ns, 'pred', 'in')
    loc_uri = sym2uri(ns, 'class', 'Location')
    org_uri = sym2uri(ns, 'class', 'Organization')
    graph.add((in_uri, RDFNS.type, RDFSNS.Property))
    graph.add((loc_uri, RDFNS.type, RDFSNS.Class))
    graph.add((org_uri, RDFNS.type, RDFSNS.Class))
    graph.add((in_uri, RDFSNS.domain, org_uri))
    graph.add((in_uri, RDFSNS.range, loc_uri))
    from nltk.corpus import ieer
    IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    for item in ieer.fileids():
        for doc in ieer.parsed_docs(item):
            for reldict in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                graph.add(make_rdf(ns, reldict, relsym='in'))
                for triple in make_rdfs(ns, reldict):
                    graph.add(triple)
    return graph
Exemple #4
0
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            connection.text_factory = sqlite3.OptimizedUnicode
            cur = connection.cursor()
            cur.execute("""create table Locations
            (OrgName text, LocationName text, DocID text)""")
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(clause(rel, relsym='IN'))
                if sql:
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute("""insert into Locations
                                    values (?, ?, ?)""", rtuple)
                        connection.commit()
                    except NameError:
                        pass

    if sql:
        try:
            cur.execute("""select OrgName from Locations
                        where LocationName = 'Atlanta'""")
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            pass
Exemple #5
0
def ieer_headlines():
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)
Exemple #6
0
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print("IEER: First 20 Headlines")
    print("=" * 45)  
    
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)
def ieer_headlines():
    
    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print "IEER: First 20 Headlines"
    print "=" * 45
    
    trees = [doc.headline for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print
        print "%s:\n%s" % (doc.docno, tree)
Exemple #8
0
def roles_demo(trace=0):
    from nltk.corpus import ieer

    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels("PER",
                                    "ORG",
                                    doc,
                                    corpus="ieer",
                                    pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
Exemple #9
0
def roles_demo(trace=0):
    from nltk.corpus import ieer

    roles = """
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|       
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*  # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
Exemple #10
0
    from nltk.sem import relextract
    pairs = relextract.tree2semi_rel(tree)
    for s, tree in pairs[18:22]:
        print('("...%s", %s)' % (" ".join(s[-5:]), tree))

    reldicts = relextract.semi_rel2reldict(pairs)
    for k, v in sorted(reldicts[0].items()):
        print(k, '=>', v)

#	The function relextract() allows us to filter the reldicts
#	according to the classes of the subject and object named entities.
#	In addition, we can specify that the filler text has to match a given regular expression,
#	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
#	relation, where IN has signature <ORG, LOC>.
    IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
    for fileid in ieer.fileids():
        print fileid
        for doc in ieer.parsed_docs(fileid):
            for rel in relextract.extract_rels('ORG',
                                               'LOC',
                                               doc,
                                               corpus='ieer',
                                               pattern=IN):
                print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

    roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

    ROLES = re.compile(roles, re.VERBOSE)
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            for rel in relextract.extract_rels('PER',
Exemple #11
0
def main():
    print "user input(1) or semcor(2)?"

    num = raw_input()

    if num == "1":
        #input
        print "enter word"
        word = raw_input()
        for meaning in (net.synsets(word)):
            #print "Sense: " + re.findall("'.*'", str(meaning))[0]
            print "Sense: " + str(meaning)
            print meaning.definition() + "\n"
            hypernyms = (meaning.hypernyms())
            if len(hypernyms) > 0:
                print "\nHypernyms:"
                for meaning2 in hypernyms:
                    print re.findall("'.*'", str(meaning2))[0]

            hyponyms = (meaning.hyponyms())
            if len(hyponyms) > 0:
                print "\nHyponyms:"
                for meaning2 in hyponyms:
                    print re.findall("'.*'", str(meaning2))[0]

    #		print "\nHypernym Tree:"
    #		print (gethypernymtree(meaning))
            print "\n"

    #		dog = wn.synset('dog.n.01')
    #		hypo = lambda s: s.hyponyms()
    #	 	hyper = lambda s: s.hypernyms()
    #list(dog.closure(s.hypernyms(), depth=1)) == dog.hypernyms()
    #True
    #>>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()

    elif (num == "2"):
        #semcor
        print "semcor"

        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print s + "\n"

            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print meanings[0].definition()
    elif num == "3":

        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text

        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))

        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)

    #	The function relextract() allows us to filter the reldicts
    #	according to the classes of the subject and object named entities.
    #	In addition, we can specify that the filler text has to match a given regular expression,
    #	 as illustrated in the next example. Here, we are looking for pairs of entities in the IN
    #	relation, where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print fileid
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG',
                                                   'LOC',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS

        roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"

        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER',
                                                   'ORG',
                                                   doc,
                                                   corpus='ieer',
                                                   pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
from nltk.corpus import conll2000, conll2002
print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE


# SEMCOR
    
from nltk.corpus import semcor
print(semcor.words())
print(semcor.chunks())
print(semcor.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]    

# IEER

from nltk.corpus import ieer
ieer.fileids() # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text) # doctest: +ELLIPSIS
Exemple #13
0
def write_all_to_conll():
    filenames = ieer.fileids()
    for filename in filenames:
        write_conll(filename)
        print 'Done with ', filename
Exemple #14
0
"""
This will convert the portion of the NIST 1999 IE-ER Dataset in NLTK
to CONLL format.

NOTE: unfortunately the IEER corpus has already tokenized the words for us
but not the sentences, so we need to do a bit of pre-processing.

"""

from nltk.corpus import ieer
from nltk.tree import Tree
import nltk

ieer.fileids()
#print(docs[0].text)
CONLL_WRITE = '../data/'


def tree2conll_without_postags(t):
    tags = []
    for child in t:
        try:
            category = child.label()
            if category in {
                    'LOCATION', 'PERSON', 'ORGANIZATION', 'DATE', 'TIME',
                    'DURATION', 'CARDINAL', 'MONEY', 'MEASURE'
            }:
                category = category[0:3]
            elif category == 'PERCENT':
                category = 'PCT'
            else:
Exemple #15
0
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('conll2002')
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('semcor')
print(semcor.words())
print(semcor.chunks())
print(semcor.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(list(map(str, semcor.tagged_chunks(tag='both')[:3])))
print([[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]])
# nltk.download('ieer')
print(ieer.fileids())  # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text)  # doctest: +ELLIPSIS
# parsed corpora
print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')
      [0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP