Example #1
0
def Extract_phrases_document (text, stop, split, all_pos_tags):
	"""Extract candidate noun phrases (NP chunks) from a document.

	Parameters:
		text         -- raw document text.
		stop         -- stop-word resource forwarded to word_checking_stop.
		split        -- optional delimiter used to pre-split the text before
		                sentence splitting; None splits sentences directly.
		all_pos_tags -- phrase -> POS-tag cache, updated in place so a phrase
		                seen before (in this or a previous call) is tagged once.

	Returns:
		dict mapping each accepted phrase string to 1 (used as a set).
	"""
	doc_phrases = {}
	sentences = []
	if split is None:
		sentences = NLP_sent.sentence_splitting(text, 1)
	else:
		can_sens = text.split(split)
		for can_sen in can_sens:
			sentences.extend(NLP_sent.sentence_splitting(can_sen, 1))

	# Chunk grammar: NBAR = a run of adjectives/nouns ending in a noun;
	# NP = one NBAR, or two NBARs joined by a preposition (in/of/...).
	grammar = r"""
				NBAR:
					# Nouns and Adjectives, terminated with Nouns
					{<NN.*|JJ>*<NN.*>}
			
				NP:
					{<NBAR>}
					# Above, connected with in/of/etc...
					{<NBAR><IN><NBAR>}
			"""
	# Build the chunker ONCE: the original rebuilt it for every phrase of
	# every sentence even though it is loop-invariant.
	cp = nltk.RegexpParser(grammar, loop=2)

	for sentence in sentences:
		phrases = NLP_sent.phrase_splitting(sentence)
		for phrase in phrases:
			if len(phrase) <= 2: # e.g.'ii'
				continue

			if phrase in all_pos_tags:
				pos_tags = all_pos_tags[phrase]
			else:
				#-------------------POS tagging output
				words = NLP_word.word_splitting(phrase.lower())
				pos_tags = NLP_word.word_pos_tagging(words)
				all_pos_tags[phrase] = pos_tags

			#-------------------parsed tree
			cp_tree = cp.parse(pos_tags)
			terms = get_terms(cp_tree)
			for term in terms:
				# Distinct name: the original rebound 'phrase' here,
				# shadowing the loop variable above.
				candidate = ' '.join(term)
				if word_checking_stop(candidate, stop) == 0: # filter stop words
					if len(candidate) > 1: # at least two characters
						doc_phrases[candidate] = 1

	return doc_phrases
def keywords_syntax_nltk(sentence):
	"""Extract keyword terms from a sentence via NP chunking.

	Splits the sentence into phrases, POS-tags each phrase (results cached
	in the module-level ``text_terms`` dict), chunks the tags with an NP
	grammar, and returns the chunked terms joined into strings.
	"""
	global text_terms
	terms = []

	# Chunk grammar: NBAR = adjectives/nouns ending in a noun; NP = one
	# NBAR, or two NBARs joined by a preposition.  Build the parser ONCE
	# per call: the original reconstructed it on every cache miss.
	grammar = r"""
				NBAR:
					# Nouns and Adjectives, terminated with Nouns
					{<NN.*|JJ>*<NN.*>}
			
				NP:
					{<NBAR>}
					# Above, connected with in/of/etc...
					{<NBAR><IN><NBAR>}
			"""
	cp = nltk.RegexpParser(grammar, loop=2)

	phrases = NLP_sent.phrase_splitting(sentence)
	for phrase in phrases:
		if len(phrase) <= 2: # e.g.'ii'
			continue
		if phrase in text_terms:
			phrase_terms = text_terms[phrase]
		else:
			#-------------------POS tagging output
			words = NLP_word.word_splitting(phrase.lower())
			pos_tags = NLP_word.word_pos_tagging(words)

			#-------------------parsed tree
			cp_tree = cp.parse(pos_tags)
			phrase_terms = get_terms(cp_tree)
			text_terms[phrase] = phrase_terms

		terms += phrase_terms

	keywords = []
	for term in terms:
		if len(term) > 0:
			keywords.append(' '.join(term))
	return keywords
def keywords_ngrams(sentence):
	"""Generate word n-grams (up to 4 words) from each phrase of a
	sentence, skipping n-grams that start or end on a stop or
	non-preferred (POS) word."""
	ngrams = []
	for phrase in NLP_sent.phrase_splitting(sentence):
		if len(phrase) <= 2: # e.g.'ii'
			continue
		words = NLP_word.word_splitting(phrase.lower())
		# Pre-compute each word's stop / non-preferred category once so
		# the nested n-gram loops below never re-check the same word.
		stop_flags = [word_checking_stop(w) for w in words]

		word_count = len(words)
		for start in range(word_count):
			if 0 < stop_flags[start]:
				continue
			# Longest candidates first; a flagged final word skips only
			# that length — shorter spans are still considered.
			for end in reversed(range(start + 1, min(word_count, start + 4) + 1)):
				if 0 < stop_flags[end - 1]: # check validity
					continue
				ngram = ' '.join(words[start:end])
				if len(ngram) > 2: # require at least three characters
					ngrams.append(ngram)

	return ngrams
Example #4
0
def Generate_ngrams_document (text, max_num, stop, use_pos, ptag, use_stem, split, all_pos_tags):
	"""Generate word n-grams (1..max_num words) from a document, with
	stop-word filtering, optional POS filtering, and optional stemming.

	Parameters:
		text         -- raw document text.
		max_num      -- maximum number of words per n-gram.
		stop         -- stop-word resource forwarded to word_checking_stop.
		use_pos      -- if true, unigrams must also pass word_checking_pos.
		ptag         -- preferred POS tags, forwarded to word_checking_pos.
		use_stem     -- if true, words are stemmed with porter2 before joining.
		split        -- optional delimiter to pre-split text before sentence
		                splitting; None splits sentences directly.
		all_pos_tags -- phrase -> POS-tag cache, updated in place.

	Returns:
		dict mapping each n-gram string to its occurrence count.
	"""
	doc_ngrams = {}
	sentences = []
	if split is None:
		sentences = NLP_sent.sentence_splitting(text, 1)
	else:
		can_sens = text.split(split)
		for can_sen in can_sens:
			sentences.extend(NLP_sent.sentence_splitting(can_sen, 1))

	for sentence in sentences:
		phrases = NLP_sent.phrase_splitting(sentence)
		for phrase in phrases:
			if len(phrase) <= 2: # e.g.'ii'
				continue

			words = NLP_word.word_splitting(phrase.lower())
			pos_tags = None # only populated (and read) when use_pos is true
			if use_pos:
				if phrase in all_pos_tags:
					pos_tags = all_pos_tags[phrase]
				else:
					pos_tags = NLP_word.word_pos_tagging(words)
					all_pos_tags[phrase] = pos_tags

			# Record each word's stop / non-preferred category once to
			# avoid repeated checks in the n-gram loops below.  Tagging
			# happened above, so stemming here keeps pos_tags aligned.
			stop_pos = []
			for i in xrange(len(words)):
				word_type = word_checking_stop(words[i], stop) # 'type' renamed: shadowed the builtin
				stop_pos.append(word_type)
				if use_stem: # enable or disable stemming
					words[i] = porter2.stem(words[i])

			for i in xrange(len(words)):
				# An n-gram may not START on a stop word (categories 1-4).
				if 0 < stop_pos[i] < 5:
					continue
				for j in xrange(i + 1, min(len(words), i + max_num) + 1):
					# ...nor END on one.
					if 0 < stop_pos[j - 1] < 5: # check validity
						continue
					meaningful_word = False
					if j == i + 1:
						# Unigram: must be a clean word and, if POS
						# filtering is on, carry a preferred POS tag.
						if (stop_pos[i] == 0) and (not use_pos or word_checking_pos(pos_tags[i], ptag) == 0):
							meaningful_word = True
					else:
						# Multi-word: need at least one meaningful word
						# (category 0 or 5) and fewer than (length - 1)
						# meaningless ones.
						mless_num = 0
						for k in xrange(i, j):
							if stop_pos[k] == 0 or stop_pos[k] == 5:
								meaningful_word = True
							else:
								mless_num += 1
						if mless_num >= (j - i - 1):
							continue
					if meaningful_word:
						ngram = ' '.join(words[i:j])
						if len(ngram) > 1: # at least two characters
							doc_ngrams[ngram] = doc_ngrams.get(ngram, 0) + 1

	return doc_ngrams