Ejemplo n.º 1
0
def process_input_text(file_text, id_name):
    """Extract the weapon and incident type of one document and print them.

    The text is split into a meta part and a main part with
    ``preprocess.split_text``; processing stops with an error message if
    either part is empty.  Only the weapon slot and the incident type are
    actually computed here -- perpetrator, target and victim extraction are
    disabled and reported as the ``"-"`` placeholder.

    Args:
        file_text: raw text of the document to process.
        id_name: identifier of the document; used to look up the expected
            answer in ``KEY`` for the diagnostic output and passed through
            to ``print_out``.

    Returns:
        None.  Results are written via ``print_out`` and diagnostic prints.
    """
    global KEY

    (meta, main) = preprocess.split_text(file_text)
    # Both halves are required; either one missing is reported the same way.
    if not meta or not main:
        print "ERROR IN SPLITTING MAIN AND META"
        return

    file_text = re.sub(NEWLINE, " ", main)
    if DEBUG:
        print ("processing text", main)
        print ("")

    d = answr_dict()

    # The answer key is built lazily on first use.
    if not KEY:
        make_key()

    # NOTE(review): only referenced by the disabled chunking experiment below.
    grammar = r"""
	NP: {<RB|PP\$>?<JJ>*<NN>+<POS>?}
	NP: {<RB|PP\$>?<JJ>*<NN>+<NNS>*}
	    {<NNP>+}
	    {<RB|PP\$>?<JJ>*<NNS>*<POS>?}
	"""
    # sents = map(pos_tag, map(word_tokenize, [s for s in sent_tokenize(file_text.lower())]))
    # cp = RegexpParser(grammar)
    # for s in sents:
    # 	print cp.parse(s)
    weapons = get_weapon(file_text, d)
    print weapons
    # Take the first field of the first candidate; fall back to the same "-"
    # placeholder used for the other slots when no candidate was returned,
    # instead of failing with an IndexError.
    weapon = weapons[0][0] if weapons else "-"
    print id_name
    print "C", KEY[id_name], "\n", "D", weapon
    print
    # perpindiv = get_perp_indiv(file_text, d)
    perpindiv = "-"
    # perporg = get_perp_org(file_text, d)
    perporg = "-"
    # targets = get_target(file_text, d)
    # target = targets[0][0]
    target = "-"
    # victims = get_victim(file_text, d)
    # victim = victims[0][0]
    victim = "-"

    # The incident classifier is given the main text with its newlines intact.
    incident_type = incident_predictor.get_predicted_event(main)
    print_out(id_name, incident_type, weapon, perpindiv, perporg, target, victim)
Ejemplo n.º 2
0
def _add_stripped(dest, candidates):
	"""Add each non-blank entry of candidates, whitespace-stripped, to the set dest."""
	# Extractor results may be empty/falsy; nothing to add in that case.
	if not candidates:
		return
	for item in candidates:
		item = item.strip()
		if item:
			dest.add(item)


def process_input_text(file_text,id_name):
	"""Extract victims, targets and perpetrators from one document and print them.

	The text is split into a meta part and a main part with
	``preprocess.split_text``; processing stops with an error message if
	either part is empty.  Every sentence of the main text is matched against
	the victim / target / perpetrator pattern files, the collected phrases are
	passed through a chain of ``utility`` clean-up filters (printing the
	intermediate list after each step), and the result is handed to
	``print_outf`` together with the predicted incident type.

	Args:
		file_text: raw text of the document to process.
		id_name: identifier of the document; used in the progress output and
			passed through to ``print_outf``.

	Returns:
		None.  Results are written via ``print_outf`` and diagnostic prints.
	"""
	(meta,main) = preprocess.split_text(file_text)
	# Both halves are required; either one missing is reported the same way.
	if not meta or not main:
		print "ERROR IN SPLITTING MAIN AND META"
		return
	#print proc_meta(meta)

	final_victim_set = set()
	final_target_set = set()
	final_perpi_set = set()

	file_text = re.sub(NEWLINE," ",main)
	# NOTE(review): if NEWLINE matches "\n" this split yields a single element
	# and the sentence splitter below does all the segmentation -- confirm.
	file_text_list = file_text.split('\n')
	if DEBUG:
		print ("processing text",main)
		print ("")

	# pass file text instead of main in infoextract2.py
	incident_type = incident_predictor.get_predicted_event(main)

	# TODO NER CALL A FUNCTION THAT returns NER DICT
	# ner_dict must exist even when the tagger returns nothing; it used to be
	# assigned only inside the branch below and then read unconditionally.
	ner_dict = None
	ner_tagged_text = process_ner.java_ner_tagger(file_text)
	# str.strip() returns a new string, so test the stripped value directly
	# rather than discarding it and re-testing the original.
	if ner_tagged_text and ner_tagged_text.strip():
		ner_dict = process_ner.get_entities()

	if ner_dict:
		print ner_dict

	# open the files containing the extraction patterns, one pattern per line
	victim_patt_lines = utility.f_read('victim_out_patterns_regex2').split('\n')
	target_patt_lines = utility.f_read('target_out_patterns_regex2').split('\n') # has only back patt
	perp_patt_lines = utility.f_read('perp_out_patterns_regex2').split('\n') # has both front and back patterns
	# ALGO read one line at a time .. if it matches one of the patterns then parse that line and do ur thing

	# READ EACH LINE IN THE from input file
	for line in file_text_list:
		line = line.strip()
		if not line:
			continue

		# split each line into several sentences
		for sent in utility.sent_splitter(line):
			# make sure no consecutive white spaces in ur line
			sent = sent.strip()
			# TODO remove 's and `` from sentence remove `` as well ?
			sent = re.sub(SPATT,"",sent)
			input_line = re.sub(COLL_SPACES,SPACES_REPL,sent)

			_add_stripped(final_victim_set, pattern_extractor.get_victims(input_line,victim_patt_lines))
			# TARGET LIST
			_add_stripped(final_target_set, pattern_extractor.get_targets(input_line,target_patt_lines))
			# PERPI LIST
			_add_stripped(final_perpi_set, pattern_extractor.get_perpi(input_line,perp_patt_lines))

	# now use algorithms to clean these lists and to remove redundant stuff
	# a victim cannot be an org or location ?? has to be  a person

	#subset removal
	v_new_list = list(final_victim_set)
	v_new_list = utility.remove_subsets(v_new_list)
	print "after subset removal"
	print v_new_list
	v_new_list = utility.remove_syn(v_new_list)
	print "after duplicate removal for ",id_name
	print v_new_list

	v_new_list = utility.rmv_flagged_np(v_new_list,'victim')# e.g headquarters
	print "after removing flag words   for ",id_name
	print v_new_list

	v_new_list = utility.first_word_flag(v_new_list,'victim')# e.g suspects
	print "after one removing first word flags  for ",id_name
	print v_new_list

	v_new_list = utility.first_word_rmv(v_new_list)# e.g COLONEL REPORTER
	print "after removing first title words like COLONEL etc ",id_name
	print v_new_list

	v_new_list = utility.one_word_cleaner(v_new_list)
	print "after one word and digit removal for ",id_name
	print v_new_list
	v_new_list = utility.victim_hacks(v_new_list)# e.g hacks
	print "after adding some hacks make unique",id_name
	print v_new_list
	print "###########################"

	# a target cannot be a a person or location

	t_new_list = list(final_target_set)
	t_new_list = utility.remove_subsets(t_new_list)
	print "after subset removal"
	print t_new_list
	t_new_list = utility.remove_syn(t_new_list)
	print "after duplicate removal"
	print t_new_list

	t_new_list = utility.rmv_flagged_np(t_new_list,'target')# e.g headquarters
	print "after removing flag words   for ",id_name
	print t_new_list
	t_new_list = utility.first_word_flag(t_new_list,'target')# e.g suspects
	print "after one removing first word flags  for ",id_name
	print t_new_list

	t_new_list = utility.one_word_cleaner(t_new_list)
	print "###Final after one word removal for ",id_name
	print t_new_list
	#print "###########################"

	# NER HINT a perpetrator cannot be a LOCATION or an org ??

	p_new_list = list(final_perpi_set)
	p_new_list = utility.remove_subsets(p_new_list)
	print "after subset removal"
	print p_new_list
	p_new_list = utility.remove_syn(p_new_list)
	print "after duplicate removal"
	print p_new_list

	p_new_list = utility.rmv_flagged_np(p_new_list,'perp')# e.g headquarters
	print "after removing flag words   for ",id_name
	print p_new_list
	p_new_list = utility.first_word_flag(p_new_list,'perp')# e.g suspects
	print "after one removing first word flags  for ",id_name
	print p_new_list

	p_new_list = utility.one_word_cleaner(p_new_list)
	print " Final after one word and digit removal for ",id_name
	print p_new_list
	#print "###########################"

	#dict_out    = matching.match(parsed_text)
	#print ("")
	print_outf(id_name,incident_type,[],p_new_list,[],t_new_list,v_new_list)