Example 1
		else:
			# use the first word in the pattern to split the parsed sentence
			patt = patt.strip()
			split_word = patt.split()[0]
			get_np(split_word,parsed_sent,BACK)
			


# This file reads a file containing the extracted pattern tuples, uses those
# patterns to extract NPs from a parsed file, and writes the regex patterns
# to a file, so the actual matching can be done easily.
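# Note: this excerpt omits its imports and the process_patterns helper that
# the __main__ block below relies on (the opening fragment above also relies
# on get_np and BACK, defined elsewhere in the file). Below is a minimal
# hypothetical sketch, inferred only from how process_patterns is used here
# (raw text in, list of regex strings out); not the project's actual code:
import re
import sys
import utility

def process_patterns(text):
	# hypothetical: one pattern tuple per line; make each line regex-safe
	patterns = []
	for line in text.split('\n'):
		line = line.strip()
		if not line:
			continue
		patterns.append(re.escape(line))
	return patterns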
if __name__ =="__main__":

	# read the pattern-tuples filename from the command line

	filename = sys.argv[1]
	filename_out = filename +"_regex2"
	text = utility.f_read(filename)
	new_list = process_patterns(text)
	new_list.sort()
	# write each of these into a file 
	f_w = open(filename_out,'w')
	for line in new_list:
		f_w.write(line)
		f_w.write("\n")

	f_w.close()


	# read sample parsed text file
	text = utility.f_read('victim_out_patterns_regex2')
	lines = text.split('\n')
Example 2
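# This excerpt omits its module-level imports and regex constants, and the
# print_outf helper called at the end is also defined elsewhere. The stand-ins
# below are assumptions inferred from how each name is used in this function,
# not the project's actual definitions:
import re
import preprocess
import utility
import incident_predictor
import process_ner
import pattern_extractor

DEBUG = False
NEWLINE = r'(?<!\n)\n(?!\n)'	# assumed: a single \n inside a paragraph
SPATT = r"'s|``"	# assumed: possessives and backticks to strip
COLL_SPACES = r'\s{2,}'	# assumed: runs of whitespace ...
SPACES_REPL = ' '	# ... collapsed to a single space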
def process_input_text(file_text,id_name):
	# split the document into meta and main sections; bail out if either is missing
	(meta,main) = preprocess.split_text(file_text)
	if (not meta) or (not main):
		print "ERROR IN SPLITTING MAIN AND META"
		return
	#print proc_meta(meta)
		
	temp_victim_list = []
	final_victim_set = set([])
	temp_target_list = []
	final_target_set = set([])
	temp_perpi_list = []
	final_perpi_set = set([])

	# remove the \n from in between the lines
	file_text = re.sub(NEWLINE," ",main)
	file_text_list = file_text.split('\n')
	if(DEBUG):
		print "processing text",main
		print ""
	
	# pass file_text instead of main in infoextract2.py
	incident_type = incident_predictor.get_predicted_event(main)
	# run the NER tagger; process_ner.get_entities() returns the NER dict
	ner_dict = None
	ner_tagged_text = process_ner.java_ner_tagger(file_text)
	if ner_tagged_text:
		ner_tagged_text = ner_tagged_text.strip()
		if ner_tagged_text:
			ner_dict = process_ner.get_entities()

	if ner_dict:
		print ner_dict
	# open the files containing the victim, target, and perp patterns
	text = utility.f_read('victim_out_patterns_regex2')
	victim_patt_lines = text.split('\n')
	text = utility.f_read('target_out_patterns_regex2') # has only back patterns
	target_patt_lines = text.split('\n')
	text = utility.f_read('perp_out_patterns_regex2') # has both front and back patterns
	perp_patt_lines = text.split('\n')
	# ALGO: read one line at a time; if it matches one of the patterns,
	# parse that line and extract the noun phrases
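	# Assumed format of the *_patterns_regex2 files (an inference from the
	# comments above and the BACK flag in Example 1, not documented anywhere):
	# one regex per line; "back" patterns capture an NP that follows the
	# trigger phrase, "front" patterns an NP that precedes it.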


	# read each line from the input file
	for line in file_text_list:
		line = line.strip()
		if(not line):
			continue

		# split each line into several sentences
		sents = utility.sent_splitter(line)
		for sent in sents:
			#print "processing line",line
			sent = sent.strip()
			# strip 's and `` from the sentence
			sent = re.sub(SPATT,"",sent)
			# collapse consecutive whitespace into a single space
			input_line = re.sub(COLL_SPACES,SPACES_REPL,sent)
			temp_victim_list = pattern_extractor.get_victims(input_line,victim_patt_lines)
			if temp_victim_list:
				for victim in temp_victim_list:
					victim = victim.strip()
					if victim:
						final_victim_set.add(victim)
			# TARGET LIST
			temp_target_list = pattern_extractor.get_targets(input_line,target_patt_lines)
			if temp_target_list:
				for target in temp_target_list:
					target = target.strip()
					if target:
						final_target_set.add(target)
			# PERPI LIST
			temp_perpi_list = pattern_extractor.get_perpi(input_line,perp_patt_lines)
			if temp_perpi_list:
				for perp in temp_perpi_list:
					perp = perp.strip()
					if perp:
						final_perpi_set.add(perp)


			# now use algorithms to clean these lists and remove redundant entries
	# a victim cannot be an org or a location; it has to be a person

	# subset removal (a hypothetical sketch of remove_subsets follows this function)
	v_new_list = list(final_victim_set)
	v_new_list = utility.remove_subsets(v_new_list)
	print "after subset removal"
	print v_new_list
	v_new_list = utility.remove_syn(v_new_list)
	print "after duplicate removal for ",id_name
	print v_new_list

	v_new_list = utility.rmv_flagged_np(v_new_list,'victim') # e.g. headquarters
	print "after removing flag words for ",id_name
	print v_new_list

	v_new_list = utility.first_word_flag(v_new_list,'victim') # e.g. suspects
	print "after removing first-word flags for ",id_name
	print v_new_list

	v_new_list = utility.first_word_rmv(v_new_list) # e.g. COLONEL, REPORTER
	print "after removing first title words like COLONEL etc for ",id_name
	print v_new_list

	v_new_list = utility.one_word_cleaner(v_new_list)
	print "after one-word and digit removal for ",id_name
	print v_new_list
	v_new_list = utility.victim_hacks(v_new_list) # victim-specific cleanup hacks
	print "after applying victim hacks for ",id_name
	print v_new_list
	print "###########################"

	# a target cannot be a person or a location

	t_new_list = list(final_target_set)
	t_new_list = utility.remove_subsets(t_new_list)
	print "after subset removal"
	print t_new_list
	t_new_list = utility.remove_syn(t_new_list)
	print "after duplicate removal"
	print t_new_list


	t_new_list = utility.rmv_flagged_np(t_new_list,'target') # e.g. headquarters
	print "after removing flag words for ",id_name
	print t_new_list
	t_new_list = utility.first_word_flag(t_new_list,'target') # e.g. suspects
	print "after removing first-word flags for ",id_name
	print t_new_list

	t_new_list = utility.one_word_cleaner(t_new_list)
	print "###Final after one-word removal for ",id_name
	print t_new_list
	#print "###########################"


	# NER HINT: a perpetrator cannot be a LOCATION or an org (?)

	p_new_list = list(final_perpi_set)
	p_new_list = utility.remove_subsets(p_new_list)
	print "after subset removal"
	print p_new_list
	p_new_list = utility.remove_syn(p_new_list)
	print "after duplicate removal"
	print p_new_list

	p_new_list = utility.rmv_flagged_np(p_new_list,'perp') # e.g. headquarters
	print "after removing flag words for ",id_name
	print p_new_list
	p_new_list = utility.first_word_flag(p_new_list,'perp') # e.g. suspects
	print "after removing first-word flags for ",id_name
	print p_new_list

	p_new_list = utility.one_word_cleaner(p_new_list)
	print "Final after one-word and digit removal for ",id_name
	print p_new_list
	#print "###########################"


	#dict_out    = matching.match(parsed_text)
	#print ("")
	print_outf(id_name,incident_type,[],p_new_list,[],t_new_list,v_new_list)
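# utility.remove_subsets (used above) is not shown in this excerpt. Below is
# a minimal hypothetical sketch of the subset-removal step, assuming it drops
# any NP that is contained in a longer NP from the same list; this is an
# inference from the name and the print messages, not the project's code:
def remove_subsets_sketch(np_list):
	# keep an NP only if it is not a substring of another, longer NP
	result = []
	for np in np_list:
		keep = True
		for other in np_list:
			if np != other and np in other:
				keep = False
				break
		if keep:
			result.append(np)
	return result

# e.g. remove_subsets_sketch(['three armed men', 'armed men']) -> ['three armed men']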
Example 3
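# This excerpt omits the start of the file, its imports, and the PATT
# constant. Given the lstrip('#') below, PATT presumably matches lines that
# begin with '#'; the definition here is an assumption, not the actual one:
import re
import sys
import utility

PATT = r'#'	# assumed: pattern lines are prefixed with '#'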
def get_hash_lines(text):
	# split the raw file text into lines and collect the pattern lines
	lines = text.split('\n')
	out_list = []
	for line in lines:
		# handle empty lines 
		if(not line):
			print "line",line
			print "skipped a line"
			continue 
		m = re.match(PATT,line)
		if(m):
			line = line.lstrip('#')
			line = line.strip()
			out_list.append(line)
	# sort list so that we can identify duplicate patterns 		
	out_list.sort()
	return out_list

if (__name__=="__main__"):

	filename = sys.argv[1]
	print filename
	text = utility.f_read(filename)
	out_lines = get_hash_lines(text)
	filename_n = filename+"_patterns"
	f_w = open(filename_n,'w')
	for line in out_lines:
		f_w.write(line)
		f_w.write("\n")
	f_w.close()
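# Usage sketch (the script filename is hypothetical): running
#	python get_hash_lines.py victim_out
# writes the '#'-prefixed lines, sorted, to victim_out_patterns, which the
# script in Example 1 can then turn into victim_out_patterns_regex2.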