Example #1
def find_semantic_distance(wordtag1, wordtag2):
    # Each argument is a (word, POS-tag) pair. For noun or verb pairs, returns a
    # (path_similarity, wup_similarity) tuple over WordNet; otherwise returns None.
    word1, tag1 = wordtag1
    word2, tag2 = wordtag2

    if tag1.startswith('NN') and tag2.startswith('NN'):
        if tag1 == 'NNS':
            word1 = singularize(word1)
        if tag2 == 'NNS':
            word2 = singularize(word2)
        try:
            syn1 = wn.synset('{}.n.01'.format(word1))
            syn2 = wn.synset('{}.n.01'.format(word2))
        except nltk.corpus.reader.wordnet.WordNetError:
            return None
        return syn1.path_similarity(syn2), syn1.wup_similarity(syn2)

    if tag1.startswith('VB') and tag2.startswith('VB'):
        try:
            word1 = conjugate(word1, 'inf')
        except RuntimeError:
            pass
        try:
            word2 = conjugate(word2, 'inf')
        except RuntimeError:
            pass
        try:
            syn1 = wn.synset('{}.v.01'.format(word1))
            syn2 = wn.synset('{}.v.01'.format(word2))
        except nltk.corpus.reader.wordnet.WordNetError:
            return None
        return syn1.path_similarity(syn2), syn1.wup_similarity(syn2)

    return None
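
A minimal usage sketch for find_semantic_distance, assuming the imports the snippet relies on (nltk with the WordNet corpus downloaded, e.g. via nltk.download('wordnet'), plus pattern.en's singularize and conjugate); the word/tag pairs are invented for illustration:

import nltk
from nltk.corpus import wordnet as wn
from pattern.en import singularize, conjugate

result = find_semantic_distance(('cats', 'NNS'), ('dog', 'NN'))
if result is not None:
    path_sim, wup_sim = result
    print(path_sim, wup_sim)  # path and Wu-Palmer similarity of cat.n.01 vs. dog.n.01
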
def check_animal_in_sentence(animals_set, data, per_animals_list, sentence,
                             complex_sentence_list, link, paragraph):
    # Collect the (singularized) animal names mentioned in the sentence, then file
    # the sentence as a simple pair or a complex sentence depending on the count.
    animals_in_sentence_set = set()

    for word in sentence.split(" "):
        if word in animals_set:
            singular_animal = singularize(word)

            if singular_animal in animals_set:
                animals_in_sentence_set.add(singular_animal)
            else:
                animals_in_sentence_set.add(word)

    if len(animals_in_sentence_set) < 2:
        return False, False
    if len(animals_in_sentence_set) == 2:
        per_animals_list.append(
            (animals_in_sentence_set.pop(), animals_in_sentence_set.pop()))
        data.append(sentence + "|" + link + "|" + paragraph)
        return True, True

    complex_sentence_list.append(
        (animals_in_sentence_set, sentence + "|" + link + "|" + paragraph))
    return True, False
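
A hedged usage sketch for check_animal_in_sentence; the animal set, accumulator lists, link, and paragraph values below are hypothetical:

animals_set = {'cat', 'cats', 'dog', 'dogs', 'fox'}
data, per_animals_list, complex_sentence_list = [], [], []

found, is_pair = check_animal_in_sentence(
    animals_set, data, per_animals_list, "the dog chased the cat",
    complex_sentence_list, "http://example.com/article", "paragraph-1")
# found and is_pair are both True: per_animals_list now holds the ('dog', 'cat')
# pair (in arbitrary order) and data holds the sentence joined with link and paragraph.
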
Example #3
def change_type(word, tag, change_prob):
    # With probability change_prob, perturb the word according to its POS tag:
    # swap prepositions, toggle noun number, or re-conjugate verbs.
    global PREPOSITIONS, VERB_TYPES
    if tag == "IN":
        if random.random() < change_prob:
            word = random.choice(PREPOSITIONS)
    elif tag == "NN":
        if random.random() < change_prob:
            word = pluralize(word)
    elif tag == "NNS":
        if random.random() < change_prob:
            word = singularize(word)
    elif "VB" in tag:
        if random.random() < change_prob:
            verb_type = random.choice(VERB_TYPES)
            word = conjugate(word, verb_type)
    return word
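
A usage sketch for change_type; PREPOSITIONS and VERB_TYPES are module-level globals not shown in the snippet, so the values below are placeholders, and the pattern.en imports are assumed:

import random
from pattern.en import pluralize, singularize, conjugate

PREPOSITIONS = ['in', 'on', 'at', 'with', 'under']   # placeholder list
VERB_TYPES = ['inf', '1sg', '3sg', 'part']           # pattern.en conjugation aliases

print(change_type('cat', 'NN', change_prob=1.0))   # always pluralized: 'cats'
print(change_type('ran', 'VBD', change_prob=1.0))  # re-conjugated to a random verb form
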
def cleaner(uncleaned):
    # Lowercase, singularize, and strip common suffixes from a single token.
    porter = PorterStemmer()     # only used by the commented-out variant below
    lemma = WordNetLemmatizer()  # only used by the commented-out variant below
    uncleaned = uncleaned.lower()
    uncleaned = singularize(uncleaned)
    uncleaned = re.sub(r'ly$', r'', uncleaned)
    uncleaned = re.sub(r'ed$', r'', uncleaned)
    uncleaned = re.sub(r'ing$', r'', uncleaned)
    uncleaned = re.sub(r'nes$', r'', uncleaned)
    # print(uncleaned)
    # uncleaned = uncleaned.strip()
    # uncleaned = uncleaned.translate({ord(i): None for i in '!\\@#-_:$%^&*();.,?/1”2’3“4‘567890\'\"'})
    # for i in uncleaned:
    #     t = ord(i)
    #     if t < 97 or t>122:
    #         uncleaned = uncleaned.replace(i, "")
    # uncleaned = lemma.lemmatize(uncleaned)
    # uncleaned = porter.stem(uncleaned)
    # print(uncleaned)
    return uncleaned
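
A usage sketch for cleaner, with the imports the snippet assumes (re, the NLTK stemmers, and pattern.en's singularize):

import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from pattern.en import singularize

print(cleaner('Dogs'))     # 'dog'
print(cleaner('quickly'))  # 'quick' -- the suffix stripping is plain regex, not stemming
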
Example #5
def caption_to_words(self, caption):
    '''
    Input: caption
    Output: MSCOCO words in the caption
    '''

    # standard preprocessing
    words = nltk.word_tokenize(caption.lower())
    words = [singularize(w) for w in words]

    # replace double words
    i = 0
    double_words = []
    idxs = []
    while i < len(words):
        idxs.append(i)
        double_word = ' '.join(words[i:i+2])
        if double_word in self.double_word_dict:
            double_words.append(self.double_word_dict[double_word])
            i += 2
        else:
            double_words.append(words[i])
            i += 1
    words = double_words

    # toilet seat is not chair (sentences like "the seat of the toilet" will
    # fire for "chair" if we do not include this line)
    if ('toilet' in words) and ('seat' in words):
        words = [word for word in words if word != 'seat']

    # get synonyms for all words in the caption
    idxs = [idxs[idx] for idx, word in enumerate(words)
            if word in set(self.mscoco_objects)]
    words = [word for word in words if word in set(self.mscoco_objects)]
    node_words = []
    for word in words:
        node_words.append(self.inverse_synonym_dict[word])
    # return all the MSCOCO objects in the caption
    return words, node_words, idxs, double_words
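
caption_to_words is a method cut out of a larger class (it reads self.double_word_dict, self.mscoco_objects, and self.inverse_synonym_dict). A minimal harness with invented lookup tables, assuming nltk's punkt tokenizer data is installed:

import nltk
from pattern.en import singularize
from types import SimpleNamespace

# Invented stand-in for the object the method is normally bound to.
evaluator = SimpleNamespace(
    double_word_dict={'hot dog': 'hot dog'},
    mscoco_objects={'cat', 'dog', 'hot dog'},
    inverse_synonym_dict={'cat': 'cat', 'dog': 'dog', 'hot dog': 'hot dog'},
)

words, node_words, idxs, double_words = caption_to_words(
    evaluator, 'A cat sitting next to a hot dog')
print(words)       # ['cat', 'hot dog']
print(node_words)  # ['cat', 'hot dog']
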
def SingularPlural(word, num):
    # If num is a count, print the matching singular or plural form of word.
    # If num is instead a second word form, print both singular and plural.
    try:
        num = int(num)
        if num == -1 or num == 1:
            w = singularize(word)
        else:
            w = pluralize(word)
        print(w)

    except ValueError:
        if isinstance(num, str):
            if word == singularize(num):
                print("Singular: ", singularize(word))
                print("Plural: ", pluralize(word))
            elif num == singularize(word):
                print("Singular: ", singularize(num))
                print("Plural: ", pluralize(num))
            elif singularize(word) == singularize(num):
                print("Singular: ", word)
                print("Plural: ", pluralize(word))
            else:
                print("Invalid Input")
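
A few example calls for SingularPlural (the function only prints and returns nothing):

SingularPlural('dogs', 1)      # prints: dog
SingularPlural('dog', 5)       # prints: dogs
SingularPlural('cat', 'cats')  # prints the singular and plural forms of the pair
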
def my_singularize(word):
    # Possessive pronouns end in "s" but are not plurals; pass them through untouched.
    if word in ["hers", "his", "theirs"]:
        return word
    return singularize(word)
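
A quick check of my_singularize; the guard presumably exists because pattern.en's singularize would otherwise strip the trailing "s" from these possessive pronouns:

from pattern.en import singularize

print(my_singularize('dogs'))  # 'dog'
print(my_singularize('hers'))  # 'hers', passed through unchanged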