def save_sentiment_dict(remove_neutral=True, lexicon_path=None):
    """Build and pickle a word-polarity lookup table from Pattern's English
    sentiment lexicon.

    The XML lexicon is loaded directly into a fresh ``Sentiment`` object,
    so no extended adverbs are added (to get additional adverbs, modify the
    ``load`` method of the ``Sentiment`` class in Pattern).  Each lexicon
    entry maps a word to per-POS score sequences; only the first element of
    each sequence (the polarity) is kept, and polarities are discretised to
    -1 (negative), 0 (neutral) or 1 (positive).

    Parameters
    ----------
    remove_neutral : bool
        If True (default), drop words whose aggregated polarity (the
        ``None`` key/column of the lexicon) is 0, and drop the unnamed
        ('') column.
    lexicon_path : str or None
        Path to Pattern's ``en-sentiment.xml``.  Defaults to the original
        hard-coded Anaconda py27 install location, so existing callers are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        Words as index, POS tags as columns (the aggregated score under
        'aggregated'), values in {-1, 0, 1}.  The frame is also pickled to
        ``RESOURCE_DIR + 'pattern_en_sentiment_full.pickle'``.
    """
    if lexicon_path is None:
        # Original hard-coded path of the Pattern sentiment lexicon for
        # English (machine-specific; override via `lexicon_path`).
        lexicon_path = ('C:/Users/ABittar/AppData/Local/Continuum/anaconda3/'
                        'envs/py27/lib/site-packages/pattern-2.6-py2.7.egg/'
                        'pattern/text/en/en-sentiment.xml')

    s = Sentiment()
    s.load(lexicon_path)
    s = dict(s)
    df_lex = pd.DataFrame.from_dict(s).T

    # Keep only the polarity value (first element of the score list/tuple);
    # any non-sequence cell (e.g. NaN for a missing POS) becomes 0.
    for col in df_lex.columns:
        df_lex[col] = df_lex[col].apply(
            lambda x: x[0] if isinstance(x, (list, tuple)) else 0)

    # Remove rows/words whose aggregated polarity (None column) is 0.
    # NOTE(review): the '' column drop is kept under `remove_neutral`,
    # matching the original's apparent structure — confirm intent.
    if remove_neutral:
        df_lex = df_lex.loc[~(df_lex[None] == 0)]
        df_lex.drop('', inplace=True, axis=1)

    # Discretise in a single pass: > 0 -> 1 (pos), < 0 -> -1 (neg),
    # 0 stays neutral (originally done in two separate column sweeps).
    for col in df_lex.columns:
        df_lex[col] = df_lex[col].apply(
            lambda x: 1 if x > 0 else (-1 if x < 0 else x))

    df_lex.rename(columns={None: 'aggregated'}, inplace=True)
    df_lex.to_pickle(RESOURCE_DIR + 'pattern_en_sentiment_full.pickle')

    return df_lex
entities=os.path.join(MODULE, "xx-entities.txt"),
                default=("NN", "NNP", "CD"),
                language="xx")
# NOTE(review): the three lines above complete a Parser(...) construction
# whose opening lies above this chunk — confirm against the full file.
lexicon = parser.lexicon  # Expose lexicon.

# Create the sentiment lexicon,
# see pattern/text/xx/xx-sentiment.xml for further details.
# We also need to define the tag for modifiers,
# words that modify the score of the following word
# (e.g., *very* good, *not good, ...)
sentiment = Sentiment(
    path=os.path.join(MODULE, "xx-sentiment.xml"),
    synset=None,
    negations=("no", "not", "never"),
    modifiers=("RB", ),
    modifier=lambda w: w.endswith("ly"),  # brilliantly, hardly, partially, ...
    language="xx")

# Nothing should be changed below.


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words. """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    # NOTE(review): this docstring is closed, and the function body follows,
    # below this chunk boundary.
    """ Returns a tagged Unicode string.
return tokens  # NOTE(review): tail of a function whose `def` lies above this chunk.


class Parser(_Parser):
    # Thin subclass: delegate lemmatisation to the module-level find_lemmata().
    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)


# Module-level singletons: tagger lexicon, parser, sentiment lexicon, speller.
lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en")
sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      language="en")
spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words. """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string. """
    return parser.parse(s, *args, **kwargs)
]
# NOTE(review): the `]` above closes a list literal, and `return s` ends a
# method, both begun above this chunk; `find_lemmata` below is presumably a
# method of an enclosing Parser class whose header is also out of view.
    return s

    def find_lemmata(self, tokens, **kwargs):
        # Delegate lemmatisation to the module-level find_lemmata().
        return find_lemmata(tokens)


# Module-level singletons for French: tagger lexicon, parser, sentiment lexicon.
lexicon = Lexicon(path=os.path.join(MODULE, "fr-lexicon.txt"),
                  morphology=os.path.join(MODULE, "fr-morphology.txt"),
                  context=os.path.join(MODULE, "fr-context.txt"),
                  language="fr")
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="fr")
sentiment = Sentiment(path=os.path.join(MODULE, "fr-sentiment.xml"),
                      synset=None,
                      # French negation words.
                      negations=("ne", "ni", "non", "pas", "rien", "sans",
                                 "aucun", "jamais"),
                      modifiers=("RB", ),
                      # French adverbs of manner end in -ment.
                      modifier=lambda w: w.endswith("ment"),
                      language="fr")


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words. """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string. """
    return parser.parse(s, *args, **kwargs)
"map", lambda token, tag: penntreebank2universal(token, tag))
# NOTE(review): the line above completes a call begun above this chunk
# (presumably installing a Penn-Treebank-to-universal tag mapping), and the
# `return` below ends the enclosing find_tags method — confirm in full file.
        return _Parser.find_tags(self, tokens, **kwargs)


# Module-level singletons: tagger lexicon, parser, sentiment lexicon, speller.
lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en")
sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      # English negation words (including the clitic n't).
                      negations=("no", "not", "n't", "never"),
                      modifiers=("RB", ),
                      # English adverbs of manner end in -ly.
                      modifier=lambda w: w.endswith("ly"),
                      tokenizer=parser.find_tokens,
                      language="en")
spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words. """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    # NOTE(review): this docstring is closed, and the function body follows,
    # below this chunk boundary.
    """ Returns a tagged Unicode string.