def shallowParsing():
    """Shallow-parse (chunk) a fixed sample sentence and draw the parse tree.

    POS-tags the sentence with TextBlob, then groups the tags into noun
    phrases using an NLTK regexp grammar (optional determiner, any number
    of adjectives, one noun), prints the resulting tree and opens the
    NLTK tree viewer.
    """
    sentence = "A black television and a white stove were bought for the new apartment of John."
    blob = TextBlob(sentence)

    # NP = optional determiner, zero or more adjectives, then a noun.
    grammar = "NP: {<DT>?<JJ>*<NN>}"
    chunker = nltk.RegexpParser(grammar)

    tree = chunker.parse(blob.tags)
    print(tree)
    tree.draw()
# --- Tutorial notes: chunking output and named-entity recognition -----------
# NOTE(review): this section was pasted from an article with typographic
# ("smart") quotes in string literals and a Python-2 `print` statement, both
# of which are SyntaxErrors at module level; the executable fragments also
# referenced an undefined top-level `result`. The content is preserved below
# as comments, with the example code corrected to valid Python 3.
#
# POS-tag output for the sample sentence (continued):
#   (‘and’, u’CC’), (‘a’, u’DT’), (‘white’, u’JJ’), (‘stove’, u’NN’),
#   (‘were’, u’VBD’), (‘bought’, u’VBN’), (‘for’, u’IN’), (‘the’, u’DT’),
#   (‘new’, u’JJ’), (‘apartment’, u’NN’), (‘of’, u’IN’), (‘John’, u’NNP’)]
#
# The second step is chunking:
#   reg_exp = "NP: {<DT>?<JJ>*<NN>}"
#   rp = nltk.RegexpParser(reg_exp)
#   result = rp.parse(result.tags)
#   print(result)
# Output:
#   (S (NP A/DT black/JJ television/NN) and/CC (NP a/DT white/JJ stove/NN)
#    were/VBD bought/VBN for/IN (NP the/DT new/JJ apartment/NN) of/IN John/NNP)
# It's also possible to draw the sentence tree structure using:
#   result.draw()
#
# 7.5. Named entity recognition
# Named-entity recognition (NER) aims to find named entities in text and
# classify them into pre-defined categories (names of persons, locations,
# organizations, times, etc.).
# Named-entity recognition using NLTK:
#   from nltk import word_tokenize, pos_tag, ne_chunk
#   input_str = "Bill works for Apple so he went to Boston for a conference."
#   print(ne_chunk(pos_tag(word_tokenize(input_str))))
# Output:
#   (S (PERSON Bill/NNP) works/VBZ for/IN Apple/NNP so/IN he/PRP went/VBD
#    to/TO (GPE Boston/NNP) for/IN a/DT conference/NN ./.)
#
# 7.6. Coreference resolution (anaphora resolution)
def main1():
    """Preprocess the sentence given as the first CLI argument and shallow-parse it.

    Pipeline: lowercase + strip -> remove punctuation -> drop English
    stopwords -> split run-together tokens (wordninja) -> de-duplicate
    preserving order (via the sibling helper `f7`) -> spaCy NER -> WordNet
    lemmatization -> NP chunking with an NLTK regexp grammar -> draw the
    chunk tree.

    Returns:
        str: the fully preprocessed, space-joined sentence.
    """
    input_str = sys.argv[1]
    input_str = input_str.lower().strip()
    print(input_str)

    # Remove punctuation (same character set as before; string concatenation
    # in a loop replaced with a single join — the old loop was quadratic).
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    input_str = ''.join(char for char in input_str if char not in punctuations)
    print(input_str)

    # Drop English stopwords.
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(input_str)
    input_str = [tok for tok in tokens if tok not in stop_words]
    print(input_str)

    # Split run-together tokens (e.g. "whitestove" -> "white", "stove").
    # extend() covers every split length, including the empty list the old
    # code would have crashed on (`l[0]` when len(l) == 0).
    split_tokens = []
    for token in input_str:
        split_tokens.extend(wordninja.split(token))
    input_str = split_tokens

    # De-duplicate while preserving order (f7 is defined elsewhere in this file).
    input_str = f7(input_str)
    input_str = ' '.join(input_str)

    # Named-entity recognition with spaCy's small English model.
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(input_str)
    for ent in doc.ents:
        print(ent.text, ent.label_)

    # Lemmatize every token with WordNet.
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(input_str)
    print(tokens)
    input_str = ' '.join(lemmatizer.lemmatize(tok) for tok in tokens)

    # Shallow parsing: chunk noun phrases (optional determiner, adjectives, noun).
    result = TextBlob(input_str)
    print(result.tags)
    reg_exp = "NP: {<DT>?<JJ>*<NN>}"
    rp = nltk.RegexpParser(reg_exp)
    result = rp.parse(result.tags)
    print(result)
    result.draw()

    print(input_str)
    return input_str