import collections
import os

from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.stem import WordNetLemmatizer

# IndexCreation is this project's companion indexing class (defined elsewhere
# in the repository); it supplies getWordnetTag, readArticles and
# removeArticleTitle.


def processQueryToDoImprovedLemmatization(self, posTags):
    """Lemmatize each token, using its WordNet POS tag when one is available."""
    lemmas = []
    wnl = WordNetLemmatizer()
    ic = IndexCreation()
    for word, tag in posTags:
        wnTag = ic.getWordnetTag(tag)
        if wnTag is None:
            # No WordNet mapping for this Penn Treebank tag: fall back to
            # the lemmatizer's default (noun) behaviour.
            lemmas.append(wnl.lemmatize(word))
        else:
            lemmas.append(wnl.lemmatize(word, pos=wnTag))
    return lemmas
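# Usage sketch (illustrative): the method expects NLTK POS-tagged tokens, and
# assumes the wordnet and averaged_perceptron_tagger NLTK data are downloaded.
#
#     posTags = pos_tag(word_tokenize("The cats were running home"))
#     lemmas = self.processQueryToDoImprovedLemmatization(posTags)
#     # roughly: ['The', 'cat', 'be', 'run', 'home']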
def getArticleAndWordCount(self, path):
    """Build an index -> sentence map for every article under `path` and
    print corpus statistics."""
    print("Number of articles:", str(len(os.listdir(path))))
    indexSentenceMap = collections.OrderedDict()
    wordCount = 0
    ic = IndexCreation()
    data = ic.readArticles(path)
    data = ic.removeArticleTitle(data)
    for i in range(len(data)):
        for j in range(len(data[i])):
            tokenizedWords = word_tokenize(data[i][j])
            # Key each sentence as 'A<article>S<sentence>', both 1-based,
            # e.g. 'A1S1' is the first sentence of the first article.
            index = 'A' + str(i + 1) + 'S' + str(j + 1)
            indexSentenceMap[index] = data[i][j]
            wordCount += len(tokenizedWords)
    print("Number of words in the corpus:", str(wordCount))
    return indexSentenceMap
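# Usage sketch (illustrative; 'articles/' is a hypothetical directory of
# plain-text article files in whatever layout readArticles expects):
#
#     indexSentenceMap = self.getArticleAndWordCount('articles/')
#     print(indexSentenceMap.get('A1S1'))  # first sentence of article 1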
def processQueryToExtractImprovisedHolonyms(self, posTags):
    """Collect the first part-holonym (the 'whole' a word is part of) for
    each token that has one in WordNet."""
    holonyms = []
    for word, tag in posTags:
        wnTag = IndexCreation().getWordnetTag(tag)
        if wnTag is not None:
            synset = wn.synsets(word, pos=wnTag)
        else:
            synset = wn.synsets(word)
        if len(synset) > 0:
            if len(synset[0].part_holonyms()) > 0:
                # Keep only the lemma part of the synset name,
                # e.g. 'car.n.01' -> 'car'.
                holonyms.append(
                    synset[0].part_holonyms()[0].name().split('.')[0])
    return holonyms
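# Usage sketch (illustrative): part holonyms name the whole that a word is
# part of, so for a query containing 'wheel' the first synset's part holonym
# is typically a vehicle sense, yielding something like 'wheeled_vehicle'.
#
#     posTags = pos_tag(word_tokenize("The wheel broke"))
#     print(self.processQueryToExtractImprovisedHolonyms(posTags))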
def processQueryToExtractImprovisedHeadWord(self, query):
    """Find the dependency-parse root of the query and map it to the lemma
    of its first WordNet synset. Requires a running CoreNLP server."""
    dependency_parser = CoreNLPDependencyParser('http://localhost:9000')
    headWord = None
    parsedSentence = list(dependency_parser.raw_parse(query))[0]
    # Node 0 is the artificial root of the dependency graph; its 'ROOT'
    # dependency holds the address of the sentence's head word.
    rootValue = list(
        list(parsedSentence.nodes.values())[0]['deps']['ROOT'])[0]
    for n in parsedSentence.nodes.values():
        if n['address'] == rootValue:
            headWord = n['word']
            if headWord:
                _, tag = pos_tag([headWord])[0]
                wnTag = IndexCreation().getWordnetTag(tag)
                if wnTag is not None:
                    synset = wn.synsets(headWord, pos=wnTag)
                    if len(synset) > 0:
                        # Normalise to the canonical lemma,
                        # e.g. 'run.v.01' -> 'run'.
                        headWord = synset[0].name().split('.')[0]
            break
    return headWord
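# Minimal end-to-end sketch, assuming these methods belong to a class named
# QueryProcessing (a hypothetical name; substitute the class that actually
# defines them) and that a Stanford CoreNLP server is already listening on
# port 9000, started with something like:
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
if __name__ == '__main__':
    qp = QueryProcessing()  # hypothetical class name
    query = "The wheels of the train screeched loudly"
    posTags = pos_tag(word_tokenize(query))
    print(qp.processQueryToDoImprovedLemmatization(posTags))
    print(qp.processQueryToExtractImprovisedHolonyms(posTags))
    print(qp.processQueryToExtractImprovisedHeadWord(query))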