def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with
    an intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded
    into an in-memory database, and subsequently pulled out using an SQL
    "SELECT" query.

    :param trace: if truthy, print each document number as it is processed
    :param sql: store extracted pairs in an in-memory sqlite table
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            # `str` replaces the deprecated sqlite3.OptimizedUnicode alias;
            # behavior is identical (text columns come back as unicode str).
            connection.text_factory = str
            cur = connection.cursor()
            cur.execute("""create table Locations (OrgName text, LocationName text, DocID text)""")
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    # Match a filler containing "in", but reject progressive forms ("...ing").
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(show_clause(rel, relsym='IN'))
                if sql:
                    # NameError means sqlite import failed above and `cur`
                    # was never bound; silently skip storage in that case.
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute("""insert into Locations values (?, ?, ?)""", rtuple)
                        connection.commit()
                    except NameError:
                        pass

    if sql:
        try:
            cur.execute("""select OrgName from Locations where LocationName = 'Atlanta'""")
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            pass
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus into an RDF Graph.

    :param ns: namespace passed to sym2uri when minting URIs
    :param verbose: unused; retained for interface compatibility
    """
    g = ConjunctiveGraph()
    # Register the namespace prefixes used by the generated triples.
    for prefix, uri in (
        ("nltk", BASE),
        ("org", "http://nltk.org/terms/org#"),
        ("loc", "http://nltk.org/terms/loc#"),
        ("pred", "http://nltk.org/terms/pred#"),
        ("class", "http://nltk.org/terms/class#"),
    ):
        g.bind(prefix, uri)

    in_uri = sym2uri(ns, "pred", "in")
    loc_uri = sym2uri(ns, "class", "Location")
    org_uri = sym2uri(ns, "class", "Organization")

    # Schema: "in" is a property from Organization to Location.
    for triple in (
        (in_uri, RDFNS.type, RDFSNS.Property),
        (loc_uri, RDFNS.type, RDFSNS.Class),
        (org_uri, RDFNS.type, RDFSNS.Class),
        (in_uri, RDFSNS.domain, org_uri),
        (in_uri, RDFSNS.range, loc_uri),
    ):
        g.add(triple)

    from nltk.corpus import ieer

    in_pattern = re.compile(r".*\bin\b(?!\b.+ing\b)")
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            for reldict in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=in_pattern):
                g.add(make_rdf(ns, reldict, relsym="in"))
                for extra in make_rdfs(ns, reldict):
                    g.add(extra)
    return g
def rels2rdf(ns, verbose=False):
    """
    Convert the reldicts derived from the IEER corpus into an RDF Graph.

    :param ns: namespace handed to sym2uri for URI construction
    :param verbose: unused; kept so existing callers continue to work
    """
    from nltk.corpus import ieer

    rdf_graph = ConjunctiveGraph()
    bind = rdf_graph.bind
    bind('nltk', BASE)
    bind('org', "http://nltk.org/terms/org#")
    bind('loc', "http://nltk.org/terms/loc#")
    bind('pred', "http://nltk.org/terms/pred#")
    bind('class', "http://nltk.org/terms/class#")

    in_uri = sym2uri(ns, 'pred', 'in')
    loc_uri = sym2uri(ns, 'class', 'Location')
    org_uri = sym2uri(ns, 'class', 'Organization')

    add = rdf_graph.add
    # Declare the schema for the "in" relation before adding instance data.
    add((in_uri, RDFNS.type, RDFSNS.Property))
    add((loc_uri, RDFNS.type, RDFSNS.Class))
    add((org_uri, RDFNS.type, RDFSNS.Class))
    add((in_uri, RDFSNS.domain, org_uri))
    add((in_uri, RDFSNS.range, loc_uri))

    in_pattern = re.compile(r'.*\bin\b(?!\b.+ing\b)')
    for fileid in ieer.fileids():
        for parsed in ieer.parsed_docs(fileid):
            found = extract_rels('ORG', 'LOC', parsed, corpus='ieer', pattern=in_pattern)
            for reldict in found:
                add(make_rdf(ns, reldict, relsym='in'))
                for rdfs_triple in make_rdfs(ns, reldict):
                    add(rdfs_triple)
    return rdf_graph
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with
    an intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded
    into an in-memory database, and subsequently pulled out using an SQL
    "SELECT" query.

    :param trace: if truthy, print each document number as it is processed
    :param sql: store extracted pairs in an in-memory sqlite table
    """
    from nltk.corpus import ieer

    if sql:
        try:
            import sqlite3

            connection = sqlite3.connect(":memory:")
            # `str` replaces sqlite3.OptimizedUnicode, a deprecated alias for
            # str that has been removed from recent Python versions.
            connection.text_factory = str
            cur = connection.cursor()
            cur.execute("""create table Locations (OrgName text, LocationName text, DocID text)""")
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")

    # Match a filler containing "in", but reject progressive forms ("...ing").
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if trace:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                print(clause(rel, relsym='IN'))
                if sql:
                    # NameError means sqlite was unavailable and `cur` is
                    # unbound; storage is skipped on purpose in that case.
                    try:
                        rtuple = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute("""insert into Locations values (?, ?, ?)""", rtuple)
                        connection.commit()
                    except NameError:
                        pass

    if sql:
        try:
            cur.execute("""select OrgName from Locations where LocationName = 'Atlanta'""")
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
        except NameError:
            pass
def ieer_headlines():
    """Print the document number and headline tree of the first 20 IEER docs."""
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Gather (docno, headline) pairs across every file in the corpus.
    headlines = []
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            headlines.append((doc.docno, doc.headline))

    for docno, headline in headlines[:20]:
        print()
        print("%s:\n%s" % (docno, headline))
def ieer_headlines():
    """Print the document number and headline tree of the first 20 IEER docs."""
    # Removed the unused `from nltk.tree import Tree` import: nothing in this
    # function referenced Tree.
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Pair each headline with the docno of its own document.
    trees = [
        (doc.docno, doc.headline)
        for file in ieer.fileids()
        for doc in ieer.parsed_docs(file)
    ]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)
def ieer_headlines():
    """Print the document number and headline tree of the first 20 IEER docs."""
    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Bug fix: the original collected only headlines and then printed
    # `doc.docno` using the loop variable leaked from the comprehension,
    # so every headline was labelled with the LAST document's docno.
    # Pair each headline with its own docno instead.
    trees = [
        (doc.docno, doc.headline)
        for file in ieer.fileids()
        for doc in ieer.parsed_docs(file)
    ]
    for docno, headline in trees[:20]:
        print()
        print("%s:\n%s" % (docno, headline))
def roles_demo(trace=0):
    """
    Print raw rtuples for PER has_role ORG relations found in IEER.

    :param trace: if truthy, print each docno and include left/right context
        in the emitted rtuples
    """
    from nltk.corpus import ieer

    # Verbose-mode pattern: whitespace and #-comments inside the pattern
    # are ignored by re.VERBOSE.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            # Context windows are shown only when tracing is enabled.
            show_context = bool(trace)
            if show_context:
                print(doc.docno)
                print("=" * 15)
            for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES):
                print(rtuple(rel, lcon=show_context, rcon=show_context))
def roles_demo(trace=0):
    """
    Print raw rtuples for PER has_role ORG relations found in IEER.

    :param trace: if truthy, print each docno and include left/right context
        in the emitted rtuples
    """
    from nltk.corpus import ieer

    # Fix: the pattern is now a raw string. The original plain string relied
    # on "\s" being an unrecognized escape (a SyntaxWarning on modern
    # Python); the raw prefix keeps the value identical and warning-free.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*         # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
from nltk.sem import relextract pairs = relextract.tree2semi_rel(tree) for s, tree in pairs[18:22]: print('("...%s", %s)' % (" ".join(s[-5:]), tree)) reldicts = relextract.semi_rel2reldict(pairs) for k, v in sorted(reldicts[0].items()): print(k, '=>', v) # The function relextract() allows us to filter the reldicts # according to the classes of the subject and object named entities. # In addition, we can specify that the filler text has to match a given regular expression, # as illustrated in the next example. Here, we are looking for pairs of entities in the IN # relation, where IN has signature <ORG, LOC>. IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)') for fileid in ieer.fileids(): print fileid for doc in ieer.parsed_docs(fileid): for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN): print(relextract.rtuple(rel)) # doctest: +ELLIPSIS roles = "(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)" ROLES = re.compile(roles, re.VERBOSE) for fileid in ieer.fileids(): for doc in ieer.parsed_docs(fileid): for rel in relextract.extract_rels('PER',
def main():
    """
    Interactive demo: look up WordNet senses for a typed word (option 1),
    print SemCor sentences with first-sense glosses (option 2), or run
    IEER relation extraction (option 3).

    Converted from Python 2: print statements -> print(), raw_input -> input.
    """
    print("user input(1) or semcor(2)?")
    num = input()
    if num == "1":
        # user-supplied word: show every sense with hypernyms/hyponyms
        print("enter word")
        word = input()
        for meaning in net.synsets(word):
            print("Sense: " + str(meaning))
            print(meaning.definition() + "\n")
            hypernyms = meaning.hypernyms()
            if len(hypernyms) > 0:
                print("\nHypernyms:")
                for meaning2 in hypernyms:
                    # strip the synset repr down to the quoted name
                    print(re.findall("'.*'", str(meaning2))[0])
            hyponyms = meaning.hyponyms()
            if len(hyponyms) > 0:
                print("\nHyponyms:")
                for meaning2 in hyponyms:
                    print(re.findall("'.*'", str(meaning2))[0])
            print("\n")
    elif num == "2":
        # semcor: print the first 100 sentences and a gloss for each word
        print("semcor")
        for line in semcor.sents()[0:100]:
            s = ""
            for word in line:
                s = s + " " + word
            print(s + "\n")
            for word in line:
                meanings = net.synsets(word)
                if len(meanings) > 0:
                    print(meanings[0].definition())
    elif num == "3":
        docs = ieer.parsed_docs('APW_19980424')
        tree = docs[1].text
        from nltk.sem import relextract
        pairs = relextract.tree2semi_rel(tree)
        for s, tree in pairs[18:22]:
            print('("...%s", %s)' % (" ".join(s[-5:]), tree))
        reldicts = relextract.semi_rel2reldict(pairs)
        for k, v in sorted(reldicts[0].items()):
            print(k, '=>', v)
        # The function relextract() allows us to filter the reldicts
        # according to the classes of the subject and object named entities.
        # In addition, we can specify that the filler text has to match a
        # given regular expression, as illustrated in the next example.
        # Here, we are looking for pairs of entities in the IN relation,
        # where IN has signature <ORG, LOC>.
        IN = re.compile(r'(\s?[a-z].*)?\bin\b(?!\b.+ing\b)')
        for fileid in ieer.fileids():
            print(fileid)
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
        # Raw string avoids the invalid "\s" escape of the original literal.
        roles = r"(.*(analyst|commissioner|professor|researcher|(spokes|chair)(wo)?m(e|a)n|writer|secretary|manager|commander|boss|founder)\s(of|in|for) (the)?)"
        ROLES = re.compile(roles, re.VERBOSE)
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                    print(relextract.rtuple(rel))  # doctest: +ELLIPSIS
# Walk through several NLTK corpora: CONLL chunking data, SemCor, and IEER.
from nltk.corpus import conll2000, conll2002

print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for chunked in conll2000.chunked_sents()[:2]:
    print(chunked)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for chunked in conll2002.chunked_sents()[:2]:
    print(chunked)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# SEMCOR
from nltk.corpus import semcor

print(semcor.words())
print(semcor.chunks())
print(semcor.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# Bare expressions below are doctest leftovers; their values are discarded
# when run as a script.
list(map(str, semcor.tagged_chunks(tag='both')[:3]))
[[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]]

# IEER
from nltk.corpus import ieer

ieer.fileids()  # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
# Dump the individual header fields of the first parsed document.
for field in ('docno', 'doctype', 'date_time', 'headline'):
    print(getattr(docs[0], field))
print(docs[0].text)  # doctest: +ELLIPSIS
def write_all_to_conll():
    """Convert every IEER corpus file to CONLL format via write_conll,
    reporting progress after each file.

    Converted from Python 2: the print statement is now a print() call;
    the comma argument reproduces the original trailing-space output.
    """
    filenames = ieer.fileids()
    for filename in filenames:
        write_conll(filename)
        print('Done with ', filename)
""" This will convert the portion of the NIST 1999 IE-ER Dataset in NLTK to CONLL format. NOTE: unfortunately the IEER corpus has already tokenized the words for us but not the sentences, so we need to do a bit of pre-processing. """ from nltk.corpus import ieer from nltk.tree import Tree import nltk ieer.fileids() #print(docs[0].text) CONLL_WRITE = '../data/' def tree2conll_without_postags(t): tags = [] for child in t: try: category = child.label() if category in { 'LOCATION', 'PERSON', 'ORGANIZATION', 'DATE', 'TIME', 'DURATION', 'CARDINAL', 'MONEY', 'MEASURE' }: category = category[0:3] elif category == 'PERCENT': category = 'PCT' else:
# Corpus walkthrough: CONLL chunking data, SemCor, IEER, and parsed corpora.
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for chunked in conll2000.chunked_sents()[:2]:
    print(chunked)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# nltk.download('conll2002')
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for chunked in conll2002.chunked_sents()[:2]:
    print(chunked)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# nltk.download('semcor')
print(semcor.words())
print(semcor.chunks())
print(semcor.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(list(map(str, semcor.tagged_chunks(tag='both')[:3])))
print([[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]])

# nltk.download('ieer')
print(ieer.fileids())  # doctest: +NORMALIZE_WHITESPACE
docs = ieer.parsed_docs('APW_19980314')
print(docs[0])
# Print each header field of the first parsed document in turn.
for field in ('docno', 'doctype', 'date_time', 'headline'):
    print(getattr(docs[0], field))
print(docs[0].text)  # doctest: +ELLIPSIS

# parsed corpora
print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP