Example #1
0
def Extract_phrases_document (text, stop, split, all_pos_tags):
	"""Extract candidate noun phrases (NP chunks) from a document.

	Parameters:
		text         -- raw document text.
		stop         -- stop-word resource forwarded to word_checking_stop.
		split        -- optional delimiter used to pre-split the text before
		                sentence splitting; None splits sentences directly.
		all_pos_tags -- phrase -> POS-tag cache, updated in place so a phrase
		                seen before (in this or a previous call) is tagged once.

	Returns:
		dict mapping each accepted phrase string to 1 (used as a set).
	"""
	doc_phrases = {}
	sentences = []
	if split is None:
		sentences = NLP_sent.sentence_splitting(text, 1)
	else:
		can_sens = text.split(split)
		for can_sen in can_sens:
			sentences.extend(NLP_sent.sentence_splitting(can_sen, 1))

	# Chunk grammar: NBAR = a run of adjectives/nouns ending in a noun;
	# NP = one NBAR, or two NBARs joined by a preposition (in/of/...).
	grammar = r"""
				NBAR:
					# Nouns and Adjectives, terminated with Nouns
					{<NN.*|JJ>*<NN.*>}
			
				NP:
					{<NBAR>}
					# Above, connected with in/of/etc...
					{<NBAR><IN><NBAR>}
			"""
	# Build the chunker ONCE: the original rebuilt it for every phrase of
	# every sentence even though it is loop-invariant.
	cp = nltk.RegexpParser(grammar, loop=2)

	for sentence in sentences:
		phrases = NLP_sent.phrase_splitting(sentence)
		for phrase in phrases:
			if len(phrase) <= 2: # e.g.'ii'
				continue

			if phrase in all_pos_tags:
				pos_tags = all_pos_tags[phrase]
			else:
				#-------------------POS tagging output
				words = NLP_word.word_splitting(phrase.lower())
				pos_tags = NLP_word.word_pos_tagging(words)
				all_pos_tags[phrase] = pos_tags

			#-------------------parsed tree
			cp_tree = cp.parse(pos_tags)
			terms = get_terms(cp_tree)
			for term in terms:
				# Distinct name: the original rebound 'phrase' here,
				# shadowing the loop variable above.
				candidate = ' '.join(term)
				if word_checking_stop(candidate, stop) == 0: # filter stop words
					if len(candidate) > 1: # at least two characters
						doc_phrases[candidate] = 1

	return doc_phrases
def keywords_syntax_nltk(sentence):
	"""Extract keyword terms from a sentence via NP chunking.

	Splits the sentence into phrases, POS-tags each phrase (results cached
	in the module-level ``text_terms`` dict), chunks the tags with an NP
	grammar, and returns the chunked terms joined into strings.
	"""
	global text_terms
	terms = []

	# Chunk grammar: NBAR = adjectives/nouns ending in a noun; NP = one
	# NBAR, or two NBARs joined by a preposition.  Build the parser ONCE
	# per call: the original reconstructed it on every cache miss.
	grammar = r"""
				NBAR:
					# Nouns and Adjectives, terminated with Nouns
					{<NN.*|JJ>*<NN.*>}
			
				NP:
					{<NBAR>}
					# Above, connected with in/of/etc...
					{<NBAR><IN><NBAR>}
			"""
	cp = nltk.RegexpParser(grammar, loop=2)

	phrases = NLP_sent.phrase_splitting(sentence)
	for phrase in phrases:
		if len(phrase) <= 2: # e.g.'ii'
			continue
		if phrase in text_terms:
			phrase_terms = text_terms[phrase]
		else:
			#-------------------POS tagging output
			words = NLP_word.word_splitting(phrase.lower())
			pos_tags = NLP_word.word_pos_tagging(words)

			#-------------------parsed tree
			cp_tree = cp.parse(pos_tags)
			phrase_terms = get_terms(cp_tree)
			text_terms[phrase] = phrase_terms

		terms += phrase_terms

	keywords = []
	for term in terms:
		if len(term) > 0:
			keywords.append(' '.join(term))
	return keywords
def keywords_ngrams(sentence):
	"""Generate word n-grams (up to 4 words) from each phrase of a
	sentence, skipping n-grams that start or end on a stop or
	non-preferred (POS) word."""
	ngrams = []
	for phrase in NLP_sent.phrase_splitting(sentence):
		if len(phrase) <= 2: # e.g.'ii'
			continue
		words = NLP_word.word_splitting(phrase.lower())
		# Pre-compute each word's stop / non-preferred category once so
		# the nested n-gram loops below never re-check the same word.
		stop_flags = [word_checking_stop(w) for w in words]

		word_count = len(words)
		for start in range(word_count):
			if 0 < stop_flags[start]:
				continue
			# Longest candidates first; a flagged final word skips only
			# that length — shorter spans are still considered.
			for end in reversed(range(start + 1, min(word_count, start + 4) + 1)):
				if 0 < stop_flags[end - 1]: # check validity
					continue
				ngram = ' '.join(words[start:end])
				if len(ngram) > 2: # require at least three characters
					ngrams.append(ngram)

	return ngrams
Example #4
0
def Generate_ngrams_document (text, max_num, stop, use_pos, ptag, use_stem, split, all_pos_tags):
	"""Generate word n-grams (1..max_num words) from a document, with
	stop-word filtering, optional POS filtering, and optional stemming.

	Parameters:
		text         -- raw document text.
		max_num      -- maximum number of words per n-gram.
		stop         -- stop-word resource forwarded to word_checking_stop.
		use_pos      -- if true, unigrams must also pass word_checking_pos.
		ptag         -- preferred POS tags, forwarded to word_checking_pos.
		use_stem     -- if true, words are stemmed with porter2 before joining.
		split        -- optional delimiter to pre-split text before sentence
		                splitting; None splits sentences directly.
		all_pos_tags -- phrase -> POS-tag cache, updated in place.

	Returns:
		dict mapping each n-gram string to its occurrence count.
	"""
	doc_ngrams = {}
	sentences = []
	if split is None:
		sentences = NLP_sent.sentence_splitting(text, 1)
	else:
		can_sens = text.split(split)
		for can_sen in can_sens:
			sentences.extend(NLP_sent.sentence_splitting(can_sen, 1))

	for sentence in sentences:
		phrases = NLP_sent.phrase_splitting(sentence)
		for phrase in phrases:
			if len(phrase) <= 2: # e.g.'ii'
				continue

			words = NLP_word.word_splitting(phrase.lower())
			pos_tags = None # only populated (and read) when use_pos is true
			if use_pos:
				if phrase in all_pos_tags:
					pos_tags = all_pos_tags[phrase]
				else:
					pos_tags = NLP_word.word_pos_tagging(words)
					all_pos_tags[phrase] = pos_tags

			# Record each word's stop / non-preferred category once to
			# avoid repeated checks in the n-gram loops below.  Tagging
			# happened above, so stemming here keeps pos_tags aligned.
			stop_pos = []
			for i in xrange(len(words)):
				word_type = word_checking_stop(words[i], stop) # 'type' renamed: shadowed the builtin
				stop_pos.append(word_type)
				if use_stem: # enable or disable stemming
					words[i] = porter2.stem(words[i])

			for i in xrange(len(words)):
				# An n-gram may not START on a stop word (categories 1-4).
				if 0 < stop_pos[i] < 5:
					continue
				for j in xrange(i + 1, min(len(words), i + max_num) + 1):
					# ...nor END on one.
					if 0 < stop_pos[j - 1] < 5: # check validity
						continue
					meaningful_word = False
					if j == i + 1:
						# Unigram: must be a clean word and, if POS
						# filtering is on, carry a preferred POS tag.
						if (stop_pos[i] == 0) and (not use_pos or word_checking_pos(pos_tags[i], ptag) == 0):
							meaningful_word = True
					else:
						# Multi-word: need at least one meaningful word
						# (category 0 or 5) and fewer than (length - 1)
						# meaningless ones.
						mless_num = 0
						for k in xrange(i, j):
							if stop_pos[k] == 0 or stop_pos[k] == 5:
								meaningful_word = True
							else:
								mless_num += 1
						if mless_num >= (j - i - 1):
							continue
					if meaningful_word:
						ngram = ' '.join(words[i:j])
						if len(ngram) > 1: # at least two characters
							doc_ngrams[ngram] = doc_ngrams.get(ngram, 0) + 1

	return doc_ngrams