import pandas as pd

from pattern.text import Sentiment

# Output directory for the pickled lexicon (assumed placeholder; the
# original module defines RESOURCE_DIR elsewhere).
RESOURCE_DIR = './resources/'


def save_sentiment_dict(remove_neutral=True):
    """
    Save the Pattern sentiment lexicon as a pickled DataFrame, without the
    extended adverbs. To get additional adverbs, modify the load() function
    in the Sentiment class of Pattern.
    """
    # this must be the path of the Pattern sentiment lexicon for English
    pin = 'C:/Users/ABittar/AppData/Local/Continuum/anaconda3/envs/py27/lib/site-packages/pattern-2.6-py2.7.egg/pattern/text/en/en-sentiment.xml'
    s = Sentiment()
    s.load(pin)
    s = dict(s)
    df_lex = pd.DataFrame.from_dict(s).T

    # keep only the polarity score: the first element of the
    # (polarity, subjectivity, intensity) entry, or 0 if null
    for col in df_lex.columns:
        df_lex[col] = df_lex[col].apply(
            lambda x: x[0] if isinstance(x, (list, tuple)) else 0)

    # remove words whose aggregated polarity (the None column) is 0,
    # and drop the unnamed ('') column
    if remove_neutral:
        df_lex = df_lex.loc[df_lex[None] != 0]
        df_lex.drop('', inplace=True, axis=1)

    # map scores to classes: > 0 = pos (1), < 0 = neg (-1), 0 = neutral
    for col in df_lex.columns:
        df_lex[col] = df_lex[col].apply(
            lambda x: 1 if x > 0 else (-1 if x < 0 else x))

    df_lex.rename(columns={None: 'aggregated'}, inplace=True)

    df_lex.to_pickle(RESOURCE_DIR + 'pattern_en_sentiment_full.pickle')

    return df_lex
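

# A minimal usage sketch (hypothetical session): build the pickled lexicon,
# then look up the polarity class of a word. All column names except
# 'aggregated' are POS tags from en-sentiment.xml.
if __name__ == '__main__':
    df = save_sentiment_dict(remove_neutral=True)
    print(df.shape)                      # (number of words, number of columns)
    print(df.loc['good', 'aggregated'])  # expected: 1 for a positive word
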
# Example 2: a template language module ("xx") for Pattern, wiring a parser
# to a sentiment lexicon with negations and modifiers.

import os

from pattern.text import Parser, Sentiment

MODULE = os.path.dirname(os.path.abspath(__file__))

# The first lines of this call are an assumed reconstruction (the original
# excerpt is truncated); they follow the same pattern as the modules below.
parser = Parser(lexicon=os.path.join(MODULE, "xx-lexicon.txt"),
                morphology=os.path.join(MODULE, "xx-morphology.txt"),
                context=os.path.join(MODULE, "xx-context.txt"),
                entities=os.path.join(MODULE, "xx-entities.txt"),
                default=("NN", "NNP", "CD"),
                language="xx")

lexicon = parser.lexicon  # Expose lexicon.

# Create the sentiment lexicon;
# see pattern/text/xx/xx-sentiment.xml for further details.
# We also need to define the tag for modifiers:
# words that modify the score of the following word
# (e.g., *very* good, *not* good, ...).

sentiment = Sentiment(
    path=os.path.join(MODULE, "xx-sentiment.xml"),
    synset=None,
    negations=("no", "not", "never"),
    modifiers=("RB", ),
    modifier=lambda w: w.endswith("ly"),  # brilliantly, hardly, partially, ...
    language="xx")

# Nothing should be changed below.


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)
# Example 3: the English language module for Pattern (lexicon, parser,
# sentiment lexicon, spelling).

import os

from pattern.text import Lexicon, Sentiment, Spelling
from pattern.text import Parser as _Parser
from pattern.text.en.inflect import singularize, conjugate, INFINITIVE

MODULE = os.path.dirname(os.path.abspath(__file__))


def find_lemmata(tokens):
    """ Annotates the tokens with lemmata for plural nouns and conjugated
        verbs, where each token is a [word, part-of-speech] list.
    """
    # Assumed reconstruction (the original excerpt is truncated here):
    # singularize plural nouns and reduce verbs to their infinitive.
    for token in tokens:
        word, pos, lemma = token[0], token[1], token[0]
        if pos == "NNS":
            lemma = singularize(word)
        if pos.startswith(("VB", "MD")):
            lemma = conjugate(word, INFINITIVE) or word
        token.append(lemma.lower())
    return tokens


class Parser(_Parser):
    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)


lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en")
sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      language="en")
spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)
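

# A minimal usage sketch: parse() returns a slash-formatted tagged string
# (word/POS/chunk/PNP per token); exact tags depend on the data files above.
if __name__ == "__main__":
    print(parse("The black cat sat on the mat."))
    print(spelling.suggest("teh"))  # ranked corrections, e.g. [("the", 1.0)]
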
# Example 4: the French language module for Pattern, with French negations
# and "-ment" adverbs as modifiers.

import os

from pattern.text import Lexicon, Sentiment
from pattern.text import Parser as _Parser

MODULE = os.path.dirname(os.path.abspath(__file__))


class Parser(_Parser):
    def find_tokens(self, s, **kwargs):
        # Assumed reconstruction: the head of this method is truncated in
        # the original excerpt; it tokenizes with the base parser and then
        # post-processes the resulting sentence strings.
        s = _Parser.find_tokens(self, s, **kwargs)
        return s

    def find_lemmata(self, tokens, **kwargs):
        # find_lemmata() is a module-level helper not shown in this excerpt.
        return find_lemmata(tokens)


lexicon = Lexicon(path=os.path.join(MODULE, "fr-lexicon.txt"),
                  morphology=os.path.join(MODULE, "fr-morphology.txt"),
                  context=os.path.join(MODULE, "fr-context.txt"),
                  language="fr")
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="fr")
sentiment = Sentiment(path=os.path.join(MODULE, "fr-sentiment.xml"),
                      synset=None,
                      negations=("ne", "ni", "non", "pas", "rien", "sans",
                                 "aucun", "jamais"),
                      modifiers=("RB", ),
                      modifier=lambda w: w.endswith("ment"),
                      language="fr")


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)
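

# Illustrative only: scores come from fr-sentiment.xml, and the negations
# listed above ("ne", "pas", ...) flip the polarity sign.
if __name__ == "__main__":
    print(sentiment(u"Un film vraiment excellent."))  # e.g., (0.6, 0.8)
    print(sentiment(u"Ce n'est pas bon."))            # negated polarity
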
# Example 5: the English language module with a Universal tagset mapping
# and a tokenizer wired into the sentiment lexicon.
                "map", lambda token, tag: penntreebank2universal(token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)


lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")

parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en")

sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      negations=("no", "not", "n't", "never"),
                      modifiers=("RB", ),
                      modifier=lambda w: w.endswith("ly"),
                      tokenizer=parser.find_tokens,
                      language="en")

spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)