Example 1
from textblob import TextBlob
import nltk

def shallowParsing():
    input_str = "A black television and a white stove were bought for the new apartment of John."
    result = TextBlob(input_str)
    #print(result.tags)
    # Chunking: an NP here is an optional determiner, any number of adjectives, then a noun
    reg_exp = "NP: {<DT>?<JJ>*<NN>}"
    rp = nltk.RegexpParser(reg_exp)
    result = rp.parse(result.tags)
    print(result)
    result.draw()
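
For comparison, the same noun-phrase chunking can be done without TextBlob, using NLTK's own tokenizer and part-of-speech tagger. A minimal sketch (the function name shallowParsingNltk is illustrative, and the standard NLTK tokenizer/tagger data is assumed to be installed):

import nltk
from nltk import word_tokenize, pos_tag

def shallowParsingNltk():
    input_str = "A black television and a white stove were bought for the new apartment of John."
    tags = pos_tag(word_tokenize(input_str))        # same (token, POS) pairs as TextBlob's result.tags
    rp = nltk.RegexpParser("NP: {<DT>?<JJ>*<NN>}")  # same NP grammar as above
    print(rp.parse(tags))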
Example 2
		# The first step is part-of-speech tagging (same sentence as Example 1):
			input_str = "A black television and a white stove were bought for the new apartment of John."
			result = TextBlob(input_str)
			print(result.tags)
			
			# Output
			# [('A', u'DT'), ('black', u'JJ'), ('television', u'NN'), 
			# ('and', u'CC'), ('a', u'DT'), ('white', u'JJ'), ('stove', u'NN'), 
			# ('were', u'VBD'), ('bought', u'VBN'), ('for', u'IN'), ('the', u'DT'), 
			# ('new', u'JJ'), ('apartment', u'NN'), ('of', u'IN'), ('John', u'NNP')]
			
		# The second step is chunking:
			reg_exp = "NP: {<DT>?<JJ>*<NN>}"
			rp = nltk.RegexpParser(reg_exp)
			result = rp.parse(result.tags)
			print(result)
			
			# Output
			# (S (NP A/DT black/JJ television/NN) and/CC (NP a/DT white/JJ stove/NN) 
			# were/VBD bought/VBN for/IN (NP the/DT new/JJ apartment/NN) of/IN John/NNP)
			
		# It’s also possible to draw the sentence tree structure using code
			result.draw()
			
	# 7.5. Named entity recognition
		# Named-entity recognition (NER) aims to find named entities in text and classify
		# them into pre-defined categories (names of persons, locations, organizations, times, etc.).
		
		# Named-entity recognition using NLTK:
			from nltk import word_tokenize, pos_tag, ne_chunk
			input_str = "Bill works for Apple so he went to Boston for a conference."
			print(ne_chunk(pos_tag(word_tokenize(input_str))))
			
		# Output:
		# (S (PERSON Bill/NNP) works/VBZ for/IN Apple/NNP so/IN he/PRP went/VBD to/TO (GPE Boston/NNP) for/IN a/DT conference/NN ./.)
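		
		# Note: word_tokenize, pos_tag and ne_chunk rely on NLTK data packages that may
		# need to be downloaded first. A minimal setup sketch (the identifiers below are
		# the standard NLTK resource names; exact names can differ between NLTK versions):
			import nltk
			nltk.download('punkt')                       # tokenizer models
			nltk.download('averaged_perceptron_tagger')  # POS tagger
			nltk.download('maxent_ne_chunker')           # named-entity chunker
			nltk.download('words')                       # word list used by the chunker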
		
	
	# 7.6. Coreference resolution (anaphora resolution)
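		# Coreference (anaphora) resolution links pronouns and other referring expressions
		# to the entity they mention, e.g. resolving "he" to "Bill" in the sentence above.
		# NLTK has no built-in coreference resolver; the sketch below uses the third-party
		# neuralcoref extension for spaCy as one possible approach (it assumes spaCy 2.x
		# and `pip install neuralcoref`), not a method used elsewhere in these notes:
			import spacy
			import neuralcoref
			
			nlp = spacy.load('en_core_web_sm')
			neuralcoref.add_to_pipe(nlp)     # registers the coreference component on the pipeline
			
			doc = nlp(u'Bill works for Apple so he went to Boston for a conference.')
			print(doc._.has_coref)           # True if at least one coreference cluster was found
			print(doc._.coref_clusters)      # mention clusters, e.g. linking 'he' back to 'Bill'
			print(doc._.coref_resolved)      # text with mentions replaced by their antecedents
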
import sys
import nltk
import spacy
import wordninja
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob


# A combined preprocessing pipeline: lowercasing, punctuation and stop-word removal,
# compound-word splitting, de-duplication, named-entity recognition, lemmatization
# and shallow parsing of the text passed on the command line.
def main1():
    # Read the text to clean from the command line, then lowercase and strip it
    input_str = sys.argv[1]
    input_str = input_str.lower()
    input_str = input_str.strip()
    print(input_str)
    # Characters to strip from the input
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

    no_punct = ""
    for char in input_str:
        if char not in punctuations:
            no_punct = no_punct + char

    input_str=no_punct
    print(input_str)
    # Remove English stop words (requires the NLTK 'stopwords' and 'punkt' data)
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(input_str)
    input_str = [i for i in tokens if i not in stop_words]
    print(input_str)
    # Split run-together tokens into separate words with wordninja
    # (e.g. a token like 'newapartment' is split into 'new' and 'apartment')
    input_str2 = []
    for token in input_str:
        parts = wordninja.split(token)
        if len(parts) > 1:
            input_str2.extend(parts)
        else:
            input_str2.append(parts[0])

    input_str = input_str2

    # Remove duplicate tokens while preserving order (f7 is sketched below)
    input_str = f7(input_str)
    input_str=' '.join(input_str)

    # Named-entity recognition with spaCy
    nlp = spacy.load('en_core_web_sm')

    doc = nlp(input_str)

    for ent in doc.ents:
        print(ent.text, ent.label_)

    # Lemmatize every token with WordNet (requires the NLTK 'wordnet' data)
    lemmatizer = WordNetLemmatizer()

    input_str = word_tokenize(input_str)
    print(input_str)
    for i in range(len(input_str)):
        input_str[i] = lemmatizer.lemmatize(input_str[i])

    # Part-of-speech tagging with TextBlob
    input_str = ' '.join(input_str)
    result = TextBlob(input_str)
    print(result.tags)

    # Shallow parsing: chunk noun phrases with the same NP grammar as above
    reg_exp = "NP: {<DT>?<JJ>*<NN>}"
    rp = nltk.RegexpParser(reg_exp)
    result = rp.parse(result.tags)
    print(result)

    # Draw the chunk tree (opens a separate window)
    result.draw()
    
    print(input_str)
    return input_str
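
main1 calls a helper f7 that is not defined in these notes. A minimal sketch of an order-preserving de-duplication helper (a common recipe, assumed here to match the intended behaviour), together with a command-line entry point:

def f7(seq):
    # Remove duplicates while keeping the first occurrence of each token
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]


if __name__ == '__main__':
    # Usage (the script name preprocess.py is illustrative):
    #   python preprocess.py "A black television and a white stove were bought for the new apartment of John."
    main1()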