def save_sentiment_dict(remove_neutral=True, path=None):
    """Build and pickle the Pattern English sentiment lexicon as a DataFrame.

    Loads Pattern's ``en-sentiment.xml`` lexicon, keeps only the polarity
    score for each word/POS column, optionally drops neutral (0-polarity)
    words, discretizes scores to -1/0/1, and pickles the result.

    Args:
        remove_neutral: if True, drop words whose aggregated polarity is 0.
        path: location of Pattern's ``en-sentiment.xml``. Defaults to the
            original hard-coded install path (kept for backward
            compatibility); pass an explicit path to override.

    Returns:
        The processed pandas DataFrame (index = word, columns = POS tags
        plus 'aggregated').
    """
    if path is None:
        # Original hard-coded location of the Pattern sentiment lexicon;
        # now overridable via the `path` parameter.
        path = ('C:/Users/ABittar/AppData/Local/Continuum/anaconda3/envs/'
                'py27/lib/site-packages/pattern-2.6-py2.7.egg/pattern/text/'
                'en/en-sentiment.xml')
    s = Sentiment()
    s.load(path)
    df_lex = pd.DataFrame.from_dict(dict(s)).T

    # Keep only the polarity value (first element of the (polarity,
    # subjectivity, intensity) sequence); non-sequence cells become 0.
    for col in df_lex.columns:
        df_lex[col] = df_lex[col].apply(
            lambda x: x[0] if isinstance(x, (list, tuple)) else 0)

    # Remove words whose aggregated polarity (the `None` column) is 0.
    if remove_neutral:
        df_lex = df_lex.loc[~(df_lex[None] == 0)]

    # Drop the empty-string POS column.
    df_lex.drop('', inplace=True, axis=1)

    # Discretize: > 0 -> 1 (positive), < 0 -> -1 (negative), 0 stays 0.
    # (Single pass is equivalent to the original two passes: a value
    # already mapped to 1 is never < 0.)
    for col in df_lex.columns:
        df_lex[col] = df_lex[col].apply(
            lambda x: 1 if x > 0 else (-1 if x < 0 else x))

    df_lex.rename(columns={None: 'aggregated'}, inplace=True)
    df_lex.to_pickle(RESOURCE_DIR + 'pattern_en_sentiment_full.pickle')
    return df_lex
def load(self, path=None):
    """Load the lexicon, then also annotate attributive adjective forms.

    For each adjective (JJ) in the default lexicon, the attributive form
    (e.g. "verschrikkelijk" -> "verschrikkelijke") is added with the same
    polarity, subjectivity and intensity (+1% accuracy).
    """
    _Sentiment.load(self, path)
    if not path:
        # Snapshot the items first: self.annotate() inserts new keys, and
        # mutating a dict while iterating it raises RuntimeError on
        # Python 3 (the sibling loaders already use list() for this).
        for w, pos in list(dict.items(self)):
            if "JJ" in pos:
                p, s, i = pos["JJ"]
                self.annotate(attributive(w), "JJ", p, s, i)
def __init__(self, path="SentiWordNet*.txt", language="en"):
    """Sentiment lexicon backed by SentiWordNet.

    Each word maps to a tuple of polarity (-1.0 to 1.0), subjectivity
    (0.0 to 1.0) and intensity (0.5 to 2.0) scores.
    """
    Sentiment.__init__(self, path=path, language=language)
def load(self):
    """Load the lexicon, then add attributive adjective forms.

    Maps e.g. "verschrikkelijk" to the attributive "verschrikkelijke"
    (+1% accuracy); setdefault() keeps any pre-existing entry.
    """
    _Sentiment.load(self)
    # Snapshot the items first: setdefault() may insert new keys, and
    # mutating a dict while iterating it raises RuntimeError on Python 3.
    for w, pos in list(self.items()):
        if "JJ" in pos:
            self.setdefault(attributive(w), {"JJ": pos["JJ"], None: pos["JJ"]})
def load(self, path=None):
    """Load the lexicon and derive adverbs from adjectives.

    Maps "terrible" to the adverb "terribly" (+1% accuracy), applying
    the usual English spelling rules ("y" -> "i", trailing "le" dropped)
    before appending "-ly".
    """
    _Sentiment.load(self, path)
    if path:
        # A custom lexicon was given; do not add derived entries.
        return
    for word, tags in list(dict.items(self)):
        if "JJ" not in tags:
            continue
        stem = word
        if stem.endswith("y"):
            stem = stem[:-1] + "i"
        if stem.endswith("le"):
            stem = stem[:-2]
        p, s, i = tags["JJ"]
        self.annotate(stem + "ly", "RB", p, s, i)
def load(self, path=None):
    """Load the lexicon and add diacritic-free variants of its words.

    Maps e.g. "précaire" to "precaire" (+1% accuracy) so input typed
    without accents still matches; words that *end* in an accented
    character are left untouched.
    """
    _Sentiment.load(self, path)
    if path:
        # A custom lexicon was given; do not add derived entries.
        return
    folding = ((u"à", "a"), (u"é", "e"), (u"è", "e"), (u"ê", "e"), (u"ï", "i"))
    for word, tags in list(dict.items(self)):
        if word.endswith((u"à", u"è", u"é", u"ê", u"ï")):
            continue
        plain = word
        for accented, bare in folding:
            plain = plain.replace(accented, bare)
        if plain != word:
            for tag, (p, s, i) in tags.items():
                self.annotate(plain, tag, p, s, i)
def load(self, path=None):
    """Load the lexicon and add diacritic-free variants of its words.

    Maps e.g. "précaire" to "precaire" (without diacritics, +1% accuracy).
    """
    _Sentiment.load(self, path)
    if not path:
        # Snapshot the items first: self.annotate() inserts new keys, and
        # mutating a dict while iterating it raises RuntimeError on
        # Python 3 (the equivalent loader elsewhere already uses list()).
        for w, pos in list(dict.items(self)):
            w0 = w
            if not w.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                w = w.replace(u"à", "a")
                w = w.replace(u"é", "e")
                w = w.replace(u"è", "e")
                w = w.replace(u"ê", "e")
                w = w.replace(u"ï", "i")
            if w != w0:
                # Renamed to `tag`: the original inner loop shadowed `pos`.
                for tag, (p, s, i) in pos.items():
                    self.annotate(w, tag, p, s, i)
# NOTE(review): this chunk begins mid-expression (the tail of a Parser(...)
# call) and ends mid-definition (`parse` is cut off below) — both edges are
# incomplete in this view and are left as-is.
             entities=os.path.join(MODULE, "xx-entities.txt"),
             default=("NN", "NNP", "CD"),
             language="xx")
lexicon = parser.lexicon  # Expose lexicon.

# Create the sentiment lexicon,
# see pattern/text/xx/xx-sentiment.xml for further details.
# We also need to define the tag for modifiers,
# words that modify the score of the following word
# (e.g., *very* good, *not good, ...)
sentiment = Sentiment(
    path=os.path.join(MODULE, "xx-sentiment.xml"),
    synset=None,
    negations=("no", "not", "never"),
    modifiers=("RB", ),
    modifier=lambda w: w.endswith("ly"),  # brilliantly, hardly, partially, ...
    language="xx")

# Nothing should be changed below.


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split
        from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
def load(self):
    """Load the lexicon, then add attributive adjective forms.

    Maps e.g. "verschrikkelijk" to the attributive "verschrikkelijke"
    (+1% accuracy); setdefault() keeps any pre-existing entry.
    """
    _Sentiment.load(self)
    # Snapshot the items first: setdefault() may insert new keys, and
    # mutating a dict while iterating it raises RuntimeError on Python 3.
    for w, pos in list(self.items()):
        if "JJ" in pos:
            self.setdefault(attributive(w), {"JJ": pos["JJ"], None: pos["JJ"]})
# NOTE(review): this chunk begins mid-expression (inside a find_tags
# override) and ends mid-definition (`parse` is cut off below) — both
# edges are incomplete in this view and are left as-is.
            "map", lambda token, tag: penntreebank2universal(token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)


lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")
parser = Parser(lexicon=lexicon,
                default=("NN", "NNP", "CD"),
                language="en")
sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      negations=("no", "not", "n't", "never"),
                      modifiers=("RB", ),
                      modifier=lambda w: w.endswith("ly"),
                      tokenizer=parser.find_tokens,
                      language="en")
spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split
        from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
def load(self, path=None):
    """Delegate lexicon loading to the base Sentiment implementation."""
    _Sentiment.load(self, path)
def get(self, k, *args, **kwargs):
    """Look up the normalized form of *k* in the base lexicon."""
    key = normalize(k)
    return Sentiment.get(self, key, *args, **kwargs)
def __getitem__(self, k):
    """Index the base lexicon by the normalized form of *k*."""
    key = normalize(k)
    return Sentiment.__getitem__(self, key)
# NOTE(review): this chunk begins mid-definition (the tail of a method that
# builds and returns `s`) — the leading fragment is incomplete in this view
# and is left as-is.
        ]
        return s

    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)


lexicon = Lexicon(path=os.path.join(MODULE, "fr-lexicon.txt"),
                  morphology=os.path.join(MODULE, "fr-morphology.txt"),
                  context=os.path.join(MODULE, "fr-context.txt"),
                  language="fr")
parser = Parser(lexicon=lexicon,
                default=("NN", "NNP", "CD"),
                language="fr")
sentiment = Sentiment(path=os.path.join(MODULE, "fr-sentiment.xml"),
                      synset=None,
                      negations=("ne", "ni", "non", "pas", "rien", "sans",
                                 "aucun", "jamais"),
                      modifiers=("RB", ),
                      modifier=lambda w: w.endswith("ment"),
                      language="fr")


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split
        from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)
# NOTE(review): this chunk begins mid-definition (`return tokens` is the
# tail of a function cut off above) — the leading fragment is incomplete
# in this view and is left as-is.
    return tokens


class Parser(_Parser):

    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)


lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")
parser = Parser(lexicon=lexicon,
                default=("NN", "NNP", "CD"),
                language="en")
sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      language="en")
spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split
        from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)