def find_semantic_distance(wordtag1, wordtag2):
    """Return WordNet similarity scores for two POS-tagged words.

    Args:
        wordtag1: ``(word, tag)`` pair for the first word.
        wordtag2: ``(word, tag)`` pair for the second word.

    Returns:
        ``(path_similarity, wup_similarity)`` computed between the ``.01``
        synsets of the two words, or ``None`` when the tags are not both
        ``NN*`` or both ``VB*``, or when a synset lookup raises
        ``WordNetError``.
    """
    first_word, first_tag = wordtag1
    second_word, second_tag = wordtag2

    if first_tag.startswith('NN') and second_tag.startswith('NN'):
        synset_template = '{}.n.01'
        # Only words tagged exactly 'NNS' are singularized before lookup.
        if first_tag == 'NNS':
            first_word = singularize(first_word)
        if second_tag == 'NNS':
            second_word = singularize(second_word)
    elif first_tag.startswith('VB') and second_tag.startswith('VB'):
        synset_template = '{}.v.01'
        infinitives = []
        for verb in (first_word, second_word):
            try:
                verb = conjugate(verb, 'inf')
            except RuntimeError:
                # Best effort: keep the verb as given when conjugation fails.
                pass
            infinitives.append(verb)
        first_word, second_word = infinitives
    else:
        return None

    try:
        first_synset = wn.synset(synset_template.format(first_word))
        second_synset = wn.synset(synset_template.format(second_word))
    except nltk.corpus.reader.wordnet.WordNetError:
        return None

    path_score = first_synset.path_similarity(second_synset)
    wup_score = first_synset.wup_similarity(second_synset)
    return path_score, wup_score
def check_animal_in_sentence(animals_set, data, per_animals_list, sentence,
                             complex_sentence_list, link, paragraph):
    """Record a sentence according to how many distinct animals it mentions.

    The sentence is split on single spaces. Every token found in
    ``animals_set`` is collected, using its singularized form when that form
    is also in ``animals_set`` and the token itself otherwise.

    Args:
        animals_set: Collection of known animal names.
        data: List that receives ``"sentence|link|paragraph"`` for sentences
            with exactly two animals.
        per_animals_list: List that receives the 2-tuple of animals for
            sentences with exactly two animals.
        sentence: Sentence text to scan.
        complex_sentence_list: List that receives ``(animal_set, record)``
            for sentences with more than two animals.
        link: String appended to the record after the sentence.
        paragraph: String appended to the record after the link.

    Returns:
        ``(False, False)`` when fewer than two animals were found,
        ``(True, True)`` when exactly two were found, and ``(True, False)``
        when more than two were found.
    """
    found_animals = set()
    for token in sentence.split(" "):
        if token not in animals_set:
            continue
        base_form = singularize(token)
        found_animals.add(base_form if base_form in animals_set else token)

    animal_count = len(found_animals)
    if animal_count < 2:
        return False, False

    if animal_count == 2:
        # The order of the pair follows set.pop(), i.e. it is arbitrary.
        first_animal = found_animals.pop()
        second_animal = found_animals.pop()
        per_animals_list.append((first_animal, second_animal))
        record = sentence + "|" + link + "|" + paragraph
        data.append(record)
        return True, True

    record = sentence + "|" + link + "|" + paragraph
    complex_sentence_list.append((found_animals, record))
    return True, False
def change_type(word, tag, change_prob):
    """Randomly alter a word according to its POS tag.

    With probability ``change_prob`` the word is replaced as follows:

    * ``"IN"``  -> a random entry of the module-level ``PREPOSITIONS``
    * ``"NN"``  -> ``pluralize(word)``
    * ``"NNS"`` -> ``singularize(word)``
    * any tag containing ``"VB"`` -> ``conjugate(word, t)`` with ``t`` drawn
      at random from the module-level ``VERB_TYPES``

    Words with any other tag are returned unchanged, and no random number
    is drawn for them.

    Args:
        word: The word to possibly alter.
        tag: POS tag of ``word``.
        change_prob: Threshold compared against ``random.random()``.

    Returns:
        The altered word, or ``word`` itself when no change was applied.
    """
    is_handled_tag = tag in ("IN", "NN", "NNS") or "VB" in tag
    # Draw the random number only for handled tags, and only once.
    if not is_handled_tag or not random.random() < change_prob:
        return word

    if tag == "IN":
        return random.choice(PREPOSITIONS)
    if tag == "NN":
        return pluralize(word)
    if tag == "NNS":
        return singularize(word)

    target_form = random.choice(VERB_TYPES)
    return conjugate(word, target_form)
def cleaner(uncleaned):
    """Normalise a word by lowercasing, singularizing and stripping suffixes.

    The input is lowercased, passed through ``singularize`` and then has the
    trailing suffixes ``ly``, ``ed``, ``ing`` and ``nes`` removed, in that
    order. Each suffix is stripped at most once, and a later pattern sees
    the result of the earlier ones.

    Args:
        uncleaned: The string to normalise.

    Returns:
        The normalised string.
    """
    cleaned = singularize(uncleaned.lower())
    # Order matters: each substitution works on the previous one's output.
    for suffix_pattern in (r'ly$', r'ed$', r'ing$', r'nes$'):
        cleaned = re.sub(suffix_pattern, r'', cleaned)
    return cleaned
def caption_to_words(self, caption):
    '''
    Extract the MSCOCO object words mentioned in a caption.

    Input: caption (string)
    Output: tuple (words, node_words, idxs, double_words) where
        words        - caption words that are in self.mscoco_objects
        node_words   - self.inverse_synonym_dict[word] for each entry of words
        idxs         - for each entry of words, the index of its first token
                       in the tokenized caption
        double_words - every caption word after two-token phrases found in
                       self.double_word_dict were merged (unfiltered)
    '''

    #standard preprocessing
    tokens = [singularize(w) for w in nltk.word_tokenize(caption.lower())]

    #replace double words, remembering the token index each word starts at
    double_words = []
    idxs = []
    i = 0
    while i < len(tokens):
        idxs.append(i)
        double_word = ' '.join(tokens[i:i + 2])
        if double_word in self.double_word_dict:
            double_words.append(self.double_word_dict[double_word])
            i += 2
        else:
            double_words.append(tokens[i])
            i += 1

    #keep each word paired with its index so filtering cannot misalign them
    indexed_words = list(zip(idxs, double_words))

    #toilet seat is not chair (sentences like "the seat of the toilet" will
    #fire for "chair" if we do not include this line)
    if 'toilet' in double_words and 'seat' in double_words:
        indexed_words = [(idx, word) for idx, word in indexed_words
                         if word != 'seat']

    #keep only MSCOCO objects; build the lookup set once, not per word
    mscoco_objects = set(self.mscoco_objects)
    indexed_words = [(idx, word) for idx, word in indexed_words
                     if word in mscoco_objects]
    words = [word for _, word in indexed_words]
    idxs = [idx for idx, _ in indexed_words]

    #get synonyms for all words in the caption
    node_words = [self.inverse_synonym_dict[word] for word in words]

    #return all the MSCOCO objects in the caption
    return words, node_words, idxs, double_words
def SingularPlural(word, num):
    """Print the singular and/or plural form of a word.

    If ``num`` converts to an integer, the singular form of ``word`` is
    printed for 1 and -1 and the plural form for any other count.

    Otherwise ``num`` is treated as a second word and compared with ``word``:

    * ``word`` equals the singular of ``num``: the singular and plural forms
      of ``word`` are printed.
    * ``num`` equals the singular of ``word``: the singular and plural forms
      of ``num`` are printed.
    * both share the same singular: ``word`` is printed as the singular,
      together with ``pluralize(word)``.

    Any input that matches none of these cases prints ``Invalid Input``.

    Args:
        word: The word to inflect.
        num: A count (anything ``int()`` accepts) or a second word.

    Returns:
        None. All results are written to standard output.
    """
    try:
        num = int(num)
        if num == -1 or num == 1:
            w = singularize(word)
        else:
            w = pluralize(word)
        print(w)
    except ValueError:
        if not isinstance(num, str):
            # Not a count and not a word: report it instead of staying silent.
            print("Invalid Input")
        elif word == singularize(num):
            print("Singular: ", singularize(word))
            print("Plural: ", pluralize(word))
        elif num == singularize(word):
            print("Singular: ", singularize(num))
            print("Plural: ", pluralize(num))
        elif singularize(word) == singularize(num):
            print("Singular: ", word)
            print("Plural: ", pluralize(word))
        else:
            print("Invalid Input")
def my_singularize(word):
    """Singularize a word, leaving "hers", "his" and "theirs" untouched.

    Args:
        word: The word to singularize.

    Returns:
        ``word`` itself for the three listed pronouns, otherwise the result
        of ``singularize(word)``.
    """
    invariant_pronouns = ("hers", "his", "theirs")
    return word if word in invariant_pronouns else singularize(word)