Ejemplo n.º 1
0
def get_targets(sent,patt_lines):

	pot_target_list = []
	matched_patt_word = []
	for patt in patt_lines:
		if (not patt):
			continue
		patt = re.sub(COLL_SPACES,SPACES_REPL,patt)

		m = re.findall(patt,sent)
		if m:
			# check forward or backward 
			if(DEBUG):
				print "pattern matched ",m,"for patt ",patt,"and sent",sent
			# Now parse this line
			parsed_sent = parse_file(sent)
			if (not parsed_sent):
				print "could not parse line"+parsed_sent
				continue 
			# NOW NP CHUNK THE SENTENCE
			# First make sense of parsed input  	
			pos_dict,parse_dict,_ = process_parse.pprocess_pline(parsed_sent)
			# the above might return multiple lines 
			for i in xrange(len(pos_dict.keys())):
				pos_sent = pos_dict[i]
				parsed_sent = parse_dict[i]
				# NP chunking algo
				np_sent= process_parse.extract_np(pos_sent,parsed_sent)
				np_chunk_sent = process_parse.assemble_extracts(np_sent)
				# MATCHES BACK PATTERN 
				# use the first word in the patt to split the parsed sentence 
				patt = patt.strip()
				if(not patt):
					print "patt was empty line move to next"
					continue
				split_patt = patt.split()
				split_word = split_patt[0]
				split_word = split_word.strip()
				if split_word in matched_patt_word:
					if(DEBUG):
						print "###not matching back pattern since back pattern with same key word was matched ,back key word =",split_word
					continue
				m_temp  = re.search(split_word,np_chunk_sent)
				if(not m_temp):
					print "split word=",split_word,"not in sent"
					continue 
				pot_target_list = get_np(split_word,np_chunk_sent,BACK,'target')
					 
	# search for AND IN THE np if it exists divide the np into two parts 		
	new_list = and_detector(pot_target_list)	
	return new_list
Ejemplo n.º 2
0
def get_perpi(sent,patt_lines):

	pot_perpi_list = []
	matched_patt_word = []
	for patt in patt_lines:

		if (not patt):
			continue
		#m2 = re.search('MURDERED',patt)
		#if m2:
		#	print "patt",patt
		# collapse multiple white spaces 
		patt = re.sub(COLL_SPACES,SPACES_REPL,patt)
		# check if any of the victim patterns exist for this line
		m = re.findall(patt,sent)
		if m:
			# check forward or backward 
			if(DEBUG):
				print "pattern matched ",m,"for patt ",patt,"and sent",sent
			# Now parse this line
			parsed_sent = parse_file(sent)
			if (not parsed_sent):
				print "could not parse line"+parsed_sent
				continue 
			# NOW NP CHUNK THE SENTENCE
			# First make sense of parsed input  	
			pos_dict,parse_dict,_ = process_parse.pprocess_pline(parsed_sent)
			# the above might return multiple lines 
			for i in xrange(len(pos_dict.keys())):
				pos_sent = pos_dict[i]
				parsed_sent = parse_dict[i]
				# NP chunking algo
				np_sent= process_parse.extract_np(pos_sent,parsed_sent)
				np_chunk_sent = process_parse.assemble_extracts(np_sent)

				if(is_front(patt)):
					#perpi just have one word ( as of now) so just split by word
					patt = patt.strip()
					if(not patt):
						print "patt was empty line move to next"
						continue
					split_patt = patt.split()
					split_word = split_patt[0]
					split_word = split_word.strip()
					# THIS MAKES SURE THAT a FONT PATT IS NOT MATCHED AGAIN BY BACK PATT 
					#matched_patt_word.append(split_word)
					m_temp  = re.search(split_word,np_chunk_sent)
					if(not m_temp):
						print "split word=",split_word,"not in sent"
						continue 
					pot_perpi_list = get_np(split_word,np_chunk_sent,FRONT,'perpi')
					if(len(pot_perpi_list) > 0): 
						# THIS MAKES SURE THAT a FONT PATT IS NOT MATCHED AGAIN BY BACK PATT 
						matched_patt_word.append(split_word)
				else:
					# MATCHES BACK PATTERN 
					# Back patterns have three words ..second last word is the main word / split word  
					patt = patt.strip()
					if(not patt):
						print "patt was empty line move to next"
						continue
					split_patt = patt.split()
					# second last word or second word is the main word  
					split_word = split_patt[1]
					split_word = split_word.strip()
					if split_word in matched_patt_word:
						print "###not matching back pattern since back pattern with same key word was matched ,back key word =",split_word
						continue
					m_temp  = re.search(split_word,np_chunk_sent)
					if(not m_temp):
						print "split word=",split_word,"not in sent"
						continue 
					pot_perpi_list = get_np(split_word,np_chunk_sent,BACK,'perpi')
					 
	# search for AND IN THE np if it exists divide the np into two parts 		
	new_list = and_detector(pot_perpi_list)	
	
	return new_list