def remove_npstuff(sent):
    """Return *sent* with its bracketed ``[...]`` chunks run through the NP cleaner.

    The sentence is split on non-greedy ``[...]`` spans (the spans themselves
    are kept).  Each piece is then handled as follows:

    * pieces shorter than 4 characters are kept unchanged (this includes the
      empty strings ``re.split`` yields around adjacent/boundary matches);
    * pieces that start with ``[`` and end with ``]`` are replaced by
      ``utility.np_cleaner(piece)``, or dropped when that result is ``''``;
    * everything else is kept unchanged.

    The resulting pieces are joined with a single space.
    """
    extrcts = []
    # Raw string: '\[' is not a valid escape in a normal string literal.
    # The capturing group makes re.split return the bracketed chunks too.
    for piece in re.split(r'(\[.*?\])', sent):
        if len(piece) < 4:
            # Too short to clean; also guarantees piece[0]/piece[-1] below
            # are never evaluated on an empty string.
            extrcts.append(piece)
        elif piece[0] == '[' and piece[-1] == ']':
            cleaned = utility.np_cleaner(piece)
            if cleaned != '':
                extrcts.append(cleaned)
        else:
            extrcts.append(piece)
    return ' '.join(extrcts)
def _collect_nps(segment, half_pattern, label, out_list):
    """Append every non-empty cleaned NP match found in *segment* to *out_list*.

    Matches of *half_pattern* are processed first, then matches of PATT_NP.
    *label* is only used to tag the DEBUG trace of the segment being searched.
    """
    if DEBUG:
        print(label + " " + segment)
    for pattern in (half_pattern, PATT_NP):
        for np in re.findall(pattern, segment):
            np_clean = utility.np_cleaner(np)
            if np_clean:
                out_list.append(np_clean)


def get_np(word, parsed_sent, is_front, meta):
    """Collect cleaned NP matches located before or after *word* in *parsed_sent*.

    Both *word* and *parsed_sent* are stripped, then the sentence is split on
    every occurrence of *word*.

    * ``is_front`` true (e.g. "ABC WAS ASSASINATED"): the text preceding each
      occurrence of *word* is searched with PATT_LHALF_NP and PATT_NP.  If
      *word* does not occur at all, the whole sentence is searched.
    * ``is_front`` false (e.g. "murder of DEf"): the text following each
      occurrence of *word* is searched with PATT_RHALF_NP and PATT_NP.  If
      *word* does not occur at all, nothing is searched.

    Every match is passed through ``utility.np_cleaner``; empty results are
    discarded.  The survivors are de-duplicated (keeping first-occurrence
    order), passed through ``utility.common_cleaner`` and then, depending on
    *meta*, through ``utility.victim_cleaner`` ('victim'),
    ``utility.target_cleaner`` ('target') or ``utility.perpi_cleaner``
    ('perpi').  Any other *meta* value applies no extra cleaner.

    Returns the list produced by the last cleaner applied.
    """
    word = word.strip()
    parsed_sent = parsed_sent.strip()

    temp_arr = parsed_sent.split(word)
    # More than two pieces means *word* occurs at least twice.
    repeated = len(temp_arr) > 2
    if DEBUG and repeated:
        print("sent has more than two " + word + " parsed sent = " + parsed_sent)

    if is_front:
        half_pattern = PATT_LHALF_NP
        # Text before each occurrence; with zero or one occurrence only the
        # first piece is searched (the whole sentence when word is absent).
        segments = temp_arr[:-1] if repeated else temp_arr[:1]
        label = "first part" if repeated else "first half"
    else:
        half_pattern = PATT_RHALF_NP
        # Text after each occurrence; empty when word is absent.
        segments = temp_arr[1:]
        label = "second part" if repeated else "second half"

    out_list = []
    for segment in segments:
        _collect_nps(segment, half_pattern, label, out_list)

    # De-duplicate while keeping first-occurrence order so the result is
    # deterministic (a plain set() round-trip yields an arbitrary order).
    seen = set()
    unique = []
    for np in out_list:
        if np not in seen:
            seen.add(np)
            unique.append(np)

    # further process this list
    new_list = utility.common_cleaner(unique)
    if meta == 'victim':
        new_list = utility.victim_cleaner(new_list)
        if DEBUG:
            print("####victim removal list")
    elif meta == 'target':
        new_list = utility.target_cleaner(new_list)
        if DEBUG:
            print("####target removal list")
    elif meta == 'perpi':
        new_list = utility.perpi_cleaner(new_list)
        if DEBUG:
            print("####perp removal list")
    if DEBUG:
        print(new_list)
    return new_list