def lemmatize(words):
    """Lemmatize a list of (word, POS-tag) pairs and return the word roots."""
    roots = []
    lemmatizer = WordNetLemmatizer()
    for word in words:
        inflection, tag = word[0], word[1]
        wordnet_tag = get_wordnet_pos(tag)
        if wordnet_tag == '':
            # No WordNet mapping for this POS tag; keep the inflected form.
            root = inflection
        else:
            root = lemmatizer.lemmatize(inflection, wordnet_tag)
        roots.append(root)
    return roots
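
# Every snippet in this section relies on a get_wordnet_pos helper that is not
# shown. Below is a minimal sketch of such a helper, assuming it maps Penn
# Treebank tags to WordNet POS constants and returns '' when there is no
# mapping (which matches the empty-string checks in the callers above and
# below); other snippets appear to expect a non-empty default, so the real
# helpers may differ.
from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to a WordNet POS constant ('' if unknown)."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return ''
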
def get_sentiment_from_text_sentiwordnet(self, text):
    tokens = nltk.word_tokenize(text)
    tokens_tagged = nltk.pos_tag(tokens)
    total_sentiment = np.zeros(self.get_lexicon_shape())
    for tagged_pair in tokens_tagged:
        word, pos = tagged_pair[0], get_wordnet_pos(tagged_pair[1])
        if pos != '':
            # Disambiguate the word in context to pick the most likely synset.
            synset = lesk(tokens, word, pos)
            if synset is not None:
                try:
                    total_sentiment += self.get_sentiment_from_lexicon(synset.name())
                except KeyError:
                    # Synset not present in the lexicon; skip it.
                    pass
    self.logger.debug('Text: ' + text)
    tokens = self.preprocess(text, join_negatives=False)
    self.logger.debug('Tokens: ' + str(tokens))
    return total_sentiment[:2], tokens
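
# get_sentiment_from_lexicon is not shown above; it maps a synset name to a
# score vector. A hypothetical minimal version backed by NLTK's SentiWordNet
# corpus is sketched below; the function name, vector shape, and lexicon are
# assumptions, not the author's implementation.
import numpy as np
from nltk.corpus import sentiwordnet as swn


def sentiment_from_sentiwordnet(synset_name):
    """Return [positive, negative, objective] scores for a synset name."""
    senti_synset = swn.senti_synset(synset_name)
    return np.array([senti_synset.pos_score(),
                     senti_synset.neg_score(),
                     senti_synset.obj_score()])
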
def load_and_clean_data(self):
    all_sentences, named_entities = [], []
    word_lemmatizer = WordNetLemmatizer()
    with open(self.file_path) as train_file:
        words, entities, unique_entities = [], [], set()
        for line in islice(train_file, 2, None):  # skip the two header lines
            word = line.split(' ')[0]
            named_entity = line.split(' ')[-1].strip('\n')
            word = self.clean_word(word)
            if line in ('\n', '\r\n'):  # a blank line marks the end of a sentence
                all_sentences.append(' '.join(words))
                named_entities.append(' '.join(entities))
                unique_entities |= set(entities)
                words, entities = [], []
            else:
                if word:
                    # Lemmatize the word, using its POS tag when WordNet has a mapping for it.
                    word, typ = nltk.pos_tag(word_tokenize(word))[0]
                    typ = get_wordnet_pos(typ)
                    if typ:
                        lemmatized_word = word_lemmatizer.lemmatize(word, typ)
                    else:
                        lemmatized_word = word_lemmatizer.lemmatize(word)
                    words.append(lemmatized_word)
                    entities.append(named_entity)
    self.n_entities = len(unique_entities)
    self.entity_to_index = {t: i for i, t in enumerate(unique_entities)}
    self.df = pd.DataFrame(
        data={
            DataFrameHeaders.sentence: all_sentences,
            DataFrameHeaders.named_entity: named_entities
        })
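
# load_and_clean_data above assumes a CoNLL-style file: two header lines are
# skipped, each remaining line has the word first and the entity label last,
# and a blank line separates sentences. A hypothetical fragment (the actual
# column layout may differ) would look like:
#
#   EU NNP B-NP B-ORG
#   rejects VBZ B-VP O
#
#   Peter NNP B-NP B-PER
#   Blackburn NNP I-NP I-PER
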
print("Total articles: " + str(articles_count)) allwords_set = set() print('Removing stopwords...') data = removeStopWords(data) # Lemmatisation of words print('Lemmatization...') lemmatizer = WordNetLemmatizer() for article in data["articles"]: for text_id in article: # for every word in every article for word in article[str(text_id)]: word["word"] = lemmatizer.lemmatize( word["word"], utils.get_wordnet_pos(word["pos_tag"])) allwords_set.add(word["word"]) all_unique_words_count = len(allwords_set) print("Total number of unique words: " + str(all_unique_words_count)) # calculate tf for every lemma and add it to set_tf print('Calculating tf ') set_tf = {} # set_tf will contain as key the "article_id:word" ex: "12:start" and as value the tf of this word in the specified article # in this way, we can use get_tf_of_word_from_article function which will return the tf of a given word for a given article immediately, without any loop. for article in data["articles"]: for text_id in article: # for every word in every article for word in article[str(text_id)]: key = str(text_id)+":"+word["word"]
                  for word in sent]
# this doesn't work, a little salty about it
#examples_words = [word for word_tokenize(sent) in idiom_examples \
#                  for word in word_tokenize(sent)]

# Does it matter that the last sentence of one document will be combined with
# the first sentence of another?
words = brown.words() + gutenberg.words() + reuters.words() + examples_words

print("{}: Lowercasing all the words...".format(datetime.now()))
words_lower = [w.lower() for w in words]

print("{}: Lemmatizing all the words...".format(datetime.now()))
wnlt = nltk.WordNetLemmatizer()
words_lemmatized = [wnlt.lemmatize(word, get_wordnet_pos(tb_pos))
                    for word, tb_pos in nltk.pos_tag(words_lower)]

#bigrams = nltk.collocations.BigramCollocationFinder.from_words(
#    words,
#    window_size=20)
#bigrams.apply_freq_filter(20)
#bigrams_freq = bigrams.ngram_fd

### TODO(?) : Try different window sizes
print("{}: Creating bigrams frequencies and storing results...".format(
    datetime.now()))
bigrams_freq = BigramCollocationFinder.from_words(words_lemmatized,
                                                  window_size=20).ngram_fd
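
# bigrams_freq above is an NLTK FreqDist keyed by (word, word) tuples, so
# windowed co-occurrence counts can be read off directly. A small usage sketch
# (the example word pair is arbitrary):
print(bigrams_freq[('united', 'state')])   # count for the pair, 0 if unseen
print(bigrams_freq.most_common(10))        # the ten most frequent pairs
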
jsonObj = loads(dumps(badgerfish.data(fromstring(xmlstr))))

# lowercase the search query
for i in range(0, len(args.words)):
    args.words[i] = args.words[i].lower()

# add POS tags and remove stop words
wordsWithTagsAndStopWords = nltk.pos_tag(args.words)
wordsWithTags = utils.removeStopWordsFromListOfWords(wordsWithTagsAndStopWords)

# lemmatize the words and drop the tag, so they are ready for the query
lemmatizer = WordNetLemmatizer()
words = []
for i in range(0, len(wordsWithTags)):
    words.append(lemmatizer.lemmatize(
        wordsWithTags[i][0], utils.get_wordnet_pos(wordsWithTags[i][1])))

# get the articles containing the words of the query and add weights
# if articles contain multiple words of the query
docsContainingRequestedWords = getListOfDocsWithWeightsForWords(words)
finalListWithIdsAfterQuery = getSumOfWeightsForArticlesWithSameWords(
    docsContainingRequestedWords)

# display results
if len(finalListWithIdsAfterQuery) > 0:
    conn = sqlite3.connect(
        str(Path(__file__).parent) + '/database/crawler_db.sqlite')
    cursor = conn.cursor()
    finalListWithIdsAfterQuery.sort(key=lambda x: x["@weight"], reverse=True)
    if args.limit is None:  # if no limit is specified, display all relevant articles
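
# utils.removeStopWordsFromListOfWords is not shown here. Given that it is
# called on the output of nltk.pos_tag, one plausible minimal implementation is
# the sketch below, assuming it simply drops English stop words from the tagged
# (word, tag) pairs; the real helper may differ.
from nltk.corpus import stopwords


def remove_stop_words_from_tagged_words(tagged_words):
    """Filter English stop words out of a list of (word, POS-tag) pairs."""
    stop_words = set(stopwords.words('english'))
    return [(word, tag) for word, tag in tagged_words
            if word.lower() not in stop_words]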