def save_sentiment_dict(remove_neutral=True, path=None):
    """Convert the Pattern English sentiment lexicon to a pickled DataFrame.

    Loads Pattern's en-sentiment.xml lexicon, reduces each (word, POS)
    entry to its polarity value, optionally drops neutral words,
    discretises polarity to -1/0/1 and pickles the result.

    To include additional (extended) adverbs, modify the load function
    in the Sentiment class of Pattern.

    :param remove_neutral: if True, drop words whose aggregated (None-keyed)
        polarity is 0, and drop the '' column.
    :param path: location of Pattern's en-sentiment.xml lexicon; defaults
        to the previously hard-coded Anaconda install path.
    :return: the processed pandas DataFrame (also written to
        RESOURCE_DIR + 'pattern_en_sentiment_full.pickle').
    """
    if path is None:
        # Original hard-coded location of the Pattern sentiment lexicon.
        path = ('C:/Users/ABittar/AppData/Local/Continuum/anaconda3/envs/'
                'py27/lib/site-packages/pattern-2.6-py2.7.egg/pattern/'
                'text/en/en-sentiment.xml')

    s = Sentiment()
    s.load(path)
    df_lex = pd.DataFrame.from_dict(dict(s)).T

    # Keep only the polarity value (first element of the list/tuple),
    # or 0 for cells that hold no score.
    for col in df_lex.columns:
        df_lex[col] = df_lex[col].apply(
            lambda x: x[0] if isinstance(x, (list, tuple)) else 0)

    # Remove words whose aggregated polarity is 0.
    if remove_neutral:
        df_lex = df_lex.loc[~(df_lex[None] == 0)]
        df_lex.drop('', inplace=True, axis=1)

    # Discretise: > 0 -> 1 (positive), < 0 -> -1 (negative), 0 stays neutral.
    # (Single pass; equivalent to the former two-pass mapping.)
    for col in df_lex.columns:
        df_lex[col] = df_lex[col].apply(
            lambda x: 1 if x > 0 else (-1 if x < 0 else x))

    df_lex.rename(columns={None: 'aggregated'}, inplace=True)

    df_lex.to_pickle(RESOURCE_DIR + 'pattern_en_sentiment_full.pickle')

    return df_lex
Example #2
0
 def load(self, path=None):
     """Load the lexicon; for the default lexicon (no path given), also
     annotate the attributive form of every adjective (maps e.g.
     "verschrikkelijk" to "verschrikkelijke", +1% accuracy).
     """
     _Sentiment.load(self, path)
     if not path:
         # Snapshot the items first: annotate() inserts new keys, and
         # mutating a dict while iterating it raises RuntimeError on
         # Python 3 (list() is a harmless no-op on Python 2).
         for w, pos in list(dict.items(self)):
             if "JJ" in pos:
                 p, s, i = pos["JJ"]
                 self.annotate(attributive(w), "JJ", p, s, i)
Example #3
0
    def __init__(self, path="SentiWordNet*.txt", language="en"):
        """Sentiment lexicon backed by SentiWordNet.

        Each word maps to a tuple of polarity (-1.0-1.0),
        subjectivity (0.0-1.0) and intensity (0.5-2.0).
        """
        Sentiment.__init__(self, path=path, language=language)
Example #4
0
    def __init__(self, path="SentiWordNet*.txt", language="en"):
        """Initialize a SentiWordNet-based sentiment lexicon.

        Word values are (polarity, subjectivity, intensity) tuples with
        polarity in -1.0-1.0, subjectivity in 0.0-1.0 and intensity in
        0.5-2.0.
        """
        Sentiment.__init__(self, path=path, language=language)
Example #5
0
 def load(self):
     """Load the lexicon and add the attributive adjective forms
     ("verschrikkelijk" -> "verschrikkelijke", +1% accuracy).
     """
     _Sentiment.load(self)
     # Iterate over a snapshot: setdefault() may insert new keys, and
     # resizing a dict while iterating it raises RuntimeError on
     # Python 3 (list() is a harmless no-op on Python 2).
     for w, pos in list(self.items()):
         if "JJ" in pos:
             self.setdefault(attributive(w), {
                 "JJ": pos["JJ"],
                 None: pos["JJ"]
             })
Example #6
0
 def load(self, path=None):
     """Load the lexicon; for the default lexicon, also annotate the
     -ly adverb derived from each adjective ("terrible" -> "terribly",
     +1% accuracy).
     """
     _Sentiment.load(self, path)
     if path:
         return
     for word, tags in list(dict.items(self)):
         if "JJ" not in tags:
             continue
         # Derive the adverb stem: "...y" -> "...i", "...le" -> "...".
         stem = word
         if stem.endswith("y"):
             stem = stem[:-1] + "i"
         if stem.endswith("le"):
             stem = stem[:-2]
         polarity, subjectivity, intensity = tags["JJ"]
         self.annotate(stem + "ly", "RB", polarity, subjectivity, intensity)
Example #7
0
 def load(self, path=None):
     """Load the sentiment lexicon; when loading the bundled default,
     derive an RB (adverb) entry from every JJ (adjective) entry,
     mapping e.g. "terrible" to "terribly" (+1% accuracy).
     """
     _Sentiment.load(self, path)
     if not path:
         for adj, tags in list(dict.items(self)):
             if "JJ" in tags:
                 base = adj[:-1] + "i" if adj.endswith("y") else adj
                 if base.endswith("le"):
                     base = base[:-2]
                 self.annotate(base + "ly", "RB", *tags["JJ"])
Example #8
0
 def load(self, path=None):
     """Load the lexicon; for the default lexicon, also annotate a
     diacritic-free variant of each word ("précaire" -> "precaire",
     +1% accuracy).
     """
     _Sentiment.load(self, path)
     if not path:
         for original, tags in list(dict.items(self)):
             stripped = original
             # Words ending in an accented letter are left untouched.
             if not stripped.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                 for accented, plain in ((u"à", "a"), (u"é", "e"),
                                         (u"è", "e"), (u"ê", "e"),
                                         (u"ï", "i")):
                     stripped = stripped.replace(accented, plain)
             if stripped != original:
                 for tag, (p, s, i) in tags.items():
                     self.annotate(stripped, tag, p, s, i)
Example #9
0
 def load(self, path=None):
     """Load the lexicon; for the default lexicon, also annotate a
     diacritic-free variant of each word ("précaire" -> "precaire",
     +1% accuracy).
     """
     _Sentiment.load(self, path)
     if not path:
         # Snapshot the items first: annotate() inserts new keys, and
         # mutating a dict while iterating it raises RuntimeError on
         # Python 3 (list() is a harmless no-op on Python 2).
         for w, tags in list(dict.items(self)):
             w0 = w
             # Words ending in an accented letter are left untouched.
             if not w.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                 w = w.replace(u"à", "a")
                 w = w.replace(u"é", "e")
                 w = w.replace(u"è", "e")
                 w = w.replace(u"ê", "e")
                 w = w.replace(u"ï", "i")
             if w != w0:
                 # 'tag' used instead of re-binding 'pos' (the original
                 # shadowed its outer loop variable here).
                 for tag, (p, s, i) in tags.items():
                     self.annotate(w, tag, p, s, i)
Example #10
0
                entities=os.path.join(MODULE, "xx-entities.txt"),
                default=("NN", "NNP", "CD"),
                language="xx")

# Re-export the parser's tag lexicon at module level.
lexicon = parser.lexicon  # Expose lexicon.

# Create the sentiment lexicon,
# see pattern/text/xx/xx-sentiment.xml for further details.
# We also need to define the tag for modifiers,
# words that modify the score of the following word
# (e.g., *very* good, *not* good, ...)

sentiment = Sentiment(
    path=os.path.join(MODULE, "xx-sentiment.xml"),
    synset=None,
    negations=("no", "not", "never"),
    modifiers=("RB", ),
    modifier=lambda w: w.endswith("ly"),  # brilliantly, hardly, partially, ...
    language="xx")

# Nothing should be changed below.


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    # Thin wrapper around the module-level parser's tokenizer.
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
Example #11
0
 def load(self):
     """Load the lexicon and add the attributive adjective forms
     ("verschrikkelijk" -> "verschrikkelijke", +1% accuracy).
     """
     _Sentiment.load(self)
     # Snapshot first: setdefault() can insert new keys, and resizing a
     # dict while iterating it raises RuntimeError on Python 3
     # (list() is a harmless no-op on Python 2).
     for w, pos in list(self.items()):
         if "JJ" in pos:
             self.setdefault(attributive(w), {"JJ": pos["JJ"], None: pos["JJ"]})
Example #12
0
                "map", lambda token, tag: penntreebank2universal(token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)


# Tag lexicon plus morphology/context rules and named-entity lists,
# all loaded from the data files shipped alongside this module.
lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")

# Part-of-speech tagger; unknown tokens default to NN / NNP / CD.
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en")

# Sentiment lexicon, linked to WordNet via "wordnet_id"; -ly adverbs
# act as modifiers of the following word's score.
sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      negations=("no", "not", "n't", "never"),
                      modifiers=("RB", ),
                      modifier=lambda w: w.endswith("ly"),
                      tokenizer=parser.find_tokens,
                      language="en")

# Spell-checker backed by en-spelling.txt.
spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    # Thin wrapper around the module-level parser's tokenizer.
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
Example #13
0
 def load(self, path=None):
     """Load the sentiment lexicon, delegating to the base class."""
     _Sentiment.load(self, path)
Example #14
0
 def get(self, k, *args, **kwargs):
     """Dictionary get() that normalizes the key *k* before lookup."""
     return Sentiment.get(self, normalize(k), *args, **kwargs)
Example #15
0
 def __getitem__(self, k):
     """Item access that normalizes the key *k* before lookup."""
     return Sentiment.__getitem__(self, normalize(k))
Example #16
0
        ]
        return s

    def find_lemmata(self, tokens, **kwargs):
        # Delegate to the module-level find_lemmata() helper
        # (which this method shadows by name).
        return find_lemmata(tokens)


# Tag lexicon plus morphology/context rules loaded from the French data
# files shipped alongside this module.
lexicon = Lexicon(path=os.path.join(MODULE, "fr-lexicon.txt"),
                  morphology=os.path.join(MODULE, "fr-morphology.txt"),
                  context=os.path.join(MODULE, "fr-context.txt"),
                  language="fr")
# Part-of-speech tagger; unknown tokens default to NN / NNP / CD.
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="fr")
# French sentiment lexicon; adverbs ending in -ment act as modifiers
# of the following word's score.
sentiment = Sentiment(path=os.path.join(MODULE, "fr-sentiment.xml"),
                      synset=None,
                      negations=("ne", "ni", "non", "pas", "rien", "sans",
                                 "aucun", "jamais"),
                      modifiers=("RB", ),
                      modifier=lambda w: w.endswith("ment"),
                      language="fr")


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    # Thin wrapper around the module-level parser's tokenizer.
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    # Thin wrapper around the module-level parser.
    return parser.parse(s, *args, **kwargs)
Example #17
0
    return tokens


class Parser(_Parser):
    # Parser specialization that plugs in this module's lemmatizer.

    def find_lemmata(self, tokens, **kwargs):
        # Delegate to the module-level find_lemmata() helper
        # (which this method shadows by name).
        return find_lemmata(tokens)


# Tag lexicon plus morphology/context rules and named-entity lists,
# all loaded from the data files shipped alongside this module.
lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")
# Part-of-speech tagger; unknown tokens default to NN / NNP / CD.
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en")
# Sentiment lexicon, linked to WordNet via "wordnet_id".
sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      language="en")
# Spell-checker backed by en-spelling.txt.
spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    # Thin wrapper around the module-level parser's tokenizer.
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    # Thin wrapper around the module-level parser.
    return parser.parse(s, *args, **kwargs)
Example #18
0
 def load(self, path=None):
     """Load the sentiment lexicon, delegating to the base class."""
     _Sentiment.load(self, path)