print("\nPOS tags") print(pos_tags) listchunked = strchunked.split() # print(len(listchunked)) # print(len(pos_tags)) print("\n\n") ph = "" sentence = [] k = 0 while k < len(listchunked): if listchunked[k + 1] == '<S-NP>' or listchunked[ k + 1] == '<S-VP>' or listchunked[k + 1] == '<S-PP>': ph = listchunked[k] sentence.append([(ph), (listchunked[k + 1][-3:-1])]) ph = "" k += 2 # print("S") elif listchunked[k + 1] == '<B-NP>': # ph = ph + listchunked[k] while listchunked[k + 1] != '<E-NP>': ph = ph.strip() + " " + listchunked[k] k += 2 ph = ph.strip() + " " + listchunked[k] sentence.append([(ph), (listchunked[k + 1][-3:-1])]) ph = "" k += 2 # print("BNP") elif listchunked[k + 1] == '<B-VP>': while listchunked[k + 1] != '<E-VP>':
def _split_tagged(token):
    """Split a ``word^TAG`` token on its LAST caret into ``(word, tag)``.

    A token without any caret is returned as ``(token, "")`` so callers never
    have to deal with a failed match.
    """
    word, caret, tag = token.rpartition('^')
    if not caret:
        return token, ""
    return word, tag


def getTagged(ex, show, main_triplet):
    """Chunk ``ex``, attach POS tags to each chunk and return one tagged chunk.

    Steps performed:

    1. ``ex`` is wrapped in ``Sentence`` and passed to the module-level
       ``tagger``; the tagged string is split on whitespace into alternating
       word / ``<X-YY>`` tokens and grouped into ``[phrase, chunk_type]``.
    2. Every word of every chunk is matched, left to right, against
       ``nltk.pos_tag(nltk.word_tokenize(ex))`` and rewritten as ``word^TAG``.
       Words tagged ``RB``, ``DT``, ``.`` or `` `` `` are dropped and a comma
       closes the chunk collected so far.
    3. A ``'s`` / ``'`` token is glued onto the preceding word as
       ``word's^POS``; a chunk containing a tag starting with ``W`` is removed.
    4. For each remaining chunk with more than one token, debug lines
       (``SEN``, the triplet members seen before a match, ``INPUT REQUIRED:``)
       are printed.

    Args:
        ex: the raw sentence text.
        show: when truthy, print the tagged chunks after step 3.
        main_triplet: iterable of iterables of strings, only used for the
            debug print in step 4.

    Returns:
        ``(list_fin, sen)`` for the LAST multi-word chunk, where ``list_fin``
        is a list of ``(word, tag)`` tuples and ``sen`` is the words joined by
        single spaces.  ``([], "")`` when there is no multi-word chunk
        (previously this raised ``UnboundLocalError``).

    NOTE(review): the source layout reached review with its indentation
    collapsed; the final ``return`` was taken to sit after the loop (so the
    last multi-word chunk wins) -- confirm against the original file.
    """
    sentence = Sentence(ex)
    tagger.predict(sentence)
    listchunked = sentence.to_tagged_string().split()
    pos_tags = nltk.pos_tag(nltk.word_tokenize(ex))

    # ---- 1. group the word/tag stream into [phrase, chunk_type] -----------
    chunks = []
    k = 0
    while k + 1 < len(listchunked):
        tag = listchunked[k + 1]
        if tag in ('<S-NP>', '<S-VP>', '<S-PP>'):
            # Single-token chunk: '<S-NP>'[-3:-1] == 'NP'.
            chunks.append([listchunked[k], tag[-3:-1]])
            k += 2
        elif tag in ('<B-NP>', '<B-VP>'):
            end_tag = '<E-' + tag[3:]
            ph = ""
            # Walk to the word that precedes the matching <E-..> tag,
            # collecting words and skipping the interleaved tag tokens.
            while k + 1 < len(listchunked) and listchunked[k + 1] != end_tag:
                if not (listchunked[k][0] == '<'
                        and listchunked[k][-1] == '>'):
                    ph = ph.strip() + " " + listchunked[k]
                k += 1
            ph = ph.strip() + " " + listchunked[k]
            chunks.append([ph, tag[-3:-1]])
            k += 2
        elif not (tag[0] == '<' and tag[-1] == '>'):
            # Word followed by another word (no chunk tag): happens with 'CC'.
            chunks.append([listchunked[k], 'CC'])
            k += 1
        else:
            # Any other chunk tag is ignored.
            k += 2

    # ---- 2. attach POS tags ------------------------------------------------
    tagged = []
    m = 0  # cursor into pos_tags, shared across chunks
    for phrase, label in chunks:
        ph = ""
        for x in nltk.word_tokenize(phrase):
            m_start = m
            while m < len(pos_tags):
                word, pos = pos_tags[m]
                if x.strip('.') == word.strip('.'):
                    if pos in ("RB", "DT", ".", "``"):
                        # Dropped word; the cursor is intentionally not moved.
                        break
                    if pos == ",":
                        if ph:
                            tagged.append([ph.strip(), label])
                            ph = ""
                    else:
                        ph = ph.strip() + " " + x + "^" + pos
                    m += 1
                    break
                m += 1
            # Rewind whenever the cursor ended up at the end of pos_tags
            # (this includes a match on the very last token).
            if m == len(pos_tags) and m_start != len(pos_tags):
                m = m_start
        if ph:
            tagged.append([ph.strip(), label])
        elif phrase[-1] != ',' and ' ' not in phrase:
            # Single word for which no tag was kept: default to NN.
            tagged.append([phrase + "^NN", label])

    # ---- 3. merge possessives, drop chunks containing W* tags -------------
    k = 0
    while k < len(tagged):
        ph = tagged[k][0].split()
        p = 0
        while p < len(ph):
            word, _ = _split_tagged(ph[p])
            # p > 0: there must be a preceding word to attach to; without the
            # guard ph[p - 1] wrapped around to the last token of the chunk.
            if word in ("'s", "'") and p > 0:
                prev_word, _ = _split_tagged(ph[p - 1])
                s = (' '.join(ph[:p - 1]) + " " + prev_word + "'s^POS "
                     + ' '.join(ph[p + 1:]))
                tagged[k] = (s.strip(), tagged[k][1])
                ph = tagged[k][0].split()
                p -= 1
            if _split_tagged(ph[p])[1][:1] == 'W':
                del tagged[k]
                k -= 1  # compensate for the k += 1 below
                break
            p += 1
        k += 1

    if show:
        print("\n\n\tINTIAL CHUNK AND POS SENTENCE")
        for x in tagged:
            print(x)

    # ---- 4. report multi-word chunks --------------------------------------
    list_fin = []
    sen = ""
    for x in tagged:
        tokens = x[0].split()
        if len(tokens) <= 1:
            continue
        list_fin = [_split_tagged(tok) for tok in tokens]
        sen = " ".join(word for word, _ in list_fin).lstrip()
        print("SEN", sen)
        # Collect the triplet members that precede ``sen`` in each triplet.
        triplet_sentence = []
        for tr in main_triplet:
            for t in tr:
                if sen == t:
                    break
                triplet_sentence.append(t)
        print(triplet_sentence)
        print("INPUT REQUIRED:", list_fin)
    return list_fin, sen
def getPhrases(ex, tagger):
    """Chunk ``ex`` with ``tagger`` and return a cleaned list of phrases.

    ``ex`` is stripped of surrounding whitespace and leading/trailing ``.``
    and ``!``, curly quotes are replaced by straight ones, and the text is
    wrapped in ``Sentence`` and passed to ``tagger.predict``.  The tagged
    string is split on whitespace into alternating word / ``<X-YY>`` tokens
    and post-processed:

    1. words are grouped into ``[phrase, label]`` chunks (``ADJP`` is mapped
       to ``NP``, ``PRT`` to ``PP``, a word not followed by a chunk tag gets
       ``CC``, any other tag yields a placeholder labelled ``XX``);
    2. chunks are split at their first comma unless that comma sits between
       two ASCII digits; the comma becomes its own ``[",", "CC"]`` entry;
    3. adjacent ``PP``+``PP`` and ``NP``+``NP`` (or ``NP`` + the word
       ``"era"``) chunks are merged;
    4. a leading ``"and "`` and every ``"it's "`` are removed from ``NP``
       chunks;
    5. empty and ``XX`` chunks are dropped, then one trailing ``CC`` chunk.

    Args:
        ex: the raw sentence text.
        tagger: object whose ``predict(sentence)`` annotates the sentence.

    Returns:
        A list of ``[phrase, label]`` lists.
    """
    ex = (ex.strip().strip('.').strip('!')
          .replace('‘', '\'').replace('’', '\'')
          .replace('“', '"').replace('”', '"'))
    sentence = Sentence(ex)
    tagger.predict(sentence)
    listchunked = sentence.to_tagged_string().split()

    single_tags = {
        '<S-NP>': 'NP', '<S-VP>': 'VP', '<S-PP>': 'PP',
        '<S-ADJP>': 'NP', '<S-PRT>': 'PP',
    }
    # begin tag -> (matching end tag, label given to the chunk)
    span_tags = {
        '<B-NP>': ('<E-NP>', 'NP'),
        '<B-VP>': ('<E-VP>', 'VP'),
        '<B-ADJP>': ('<E-ADJP>', 'NP'),
    }

    # ---- 1. group the word/tag stream into [phrase, label] ----------------
    chunks = []
    k = 0
    while k + 1 < len(listchunked):
        tag = listchunked[k + 1]
        if tag in single_tags:
            chunks.append([listchunked[k], single_tags[tag]])
            k += 2
        elif tag in span_tags:
            end_tag, label = span_tags[tag]
            ph = ""
            # Walk to the word that precedes the end tag, collecting words
            # and skipping the interleaved tag tokens.
            while k + 1 < len(listchunked) and listchunked[k + 1] != end_tag:
                if not (listchunked[k][0] == '<'
                        and listchunked[k][-1] == '>'):
                    ph = ph.strip() + " " + listchunked[k]
                k += 1
            ph = ph.strip() + " " + listchunked[k]
            chunks.append([ph, label])
            k += 2
        elif not (tag[0] == '<' and tag[-1] == '>'):
            # Word followed by another word (no chunk tag): happens with 'CC'.
            chunks.append([listchunked[k], 'CC'])
            k += 1
        else:
            # Unhandled chunk tag: placeholder, removed in step 5.
            chunks.append(['REMOVE THIS', 'XX'])
            k += 2

    # ---- 2. split chunks at commas ----------------------------------------
    k = 0
    while k < len(chunks):
        while k < len(chunks) and ',' in chunks[k][0]:
            text, label = chunks[k]
            index = text.find(',')
            # Keep numbers such as "1,000" intact.  ``index > 0`` prevents
            # text[index - 1] from wrapping around to the last character.
            if (index > 0 and '0' <= text[index - 1] <= '9'
                    and index + 1 < len(text)
                    and '0' <= text[index + 1] <= '9'):
                break
            chunks[k][0] = text[:index]
            pieces = [[",", "CC"]]
            rest = text[index + 1:].strip()
            if rest != "":
                pieces.append([rest, label])
            chunks[k + 1:k + 1] = pieces
            # Continue with the remainder (or the next original chunk).
            k += 2
        k += 1

    # ---- 3. merge consecutive PP/PP and NP/NP chunks ----------------------
    # e.g. "than/PP in/PP" becomes a single PP.
    k = 0
    while k + 1 < len(chunks):
        cur, nxt = chunks[k], chunks[k + 1]
        if ((cur[1] == "PP" and nxt[1] == "PP")
                or (cur[1] == "NP"
                    and (nxt[1] == "NP" or nxt[0] == "era"))):
            nxt[0] = cur[0] + " " + nxt[0]
            del chunks[k]
            # Stay on the merged chunk so it can absorb further neighbours.
        else:
            k += 1

    # ---- 4. tidy NP chunks -------------------------------------------------
    for chunk in chunks:
        if chunk[1] == "NP":
            if chunk[0].startswith("and "):
                # Drop only the leading conjunction; a blanket replace() also
                # mangled inner text ("sand castle" -> "scastle").
                chunk[0] = chunk[0][len("and "):]
            if "it's " in chunk[0]:
                chunk[0] = chunk[0].replace("it's ", "")

    # ---- 5. drop empty / placeholder chunks and a trailing CC -------------
    # Filtering in one pass removes runs of consecutive placeholders too; the
    # index-based loop used before skipped the element after each removal.
    chunks = [c for c in chunks if len(c[0]) != 0 and c[1] != "XX"]
    if chunks and chunks[-1][1] == "CC":
        chunks = chunks[:-1]
    return chunks