Code Example #1
 def __init__(self, wordnet16_dir=None, wn_domains_dir=None):
     """Initializes the WordNet-Affect object."""
     wordnet16_dir = wordnet16_dir or join(dirname(__file__), "wordnet-1.6")
     wn_domains_dir = wn_domains_dir or join(dirname(__file__),
                                             "wn-domains-3.2")
     cwd = os.getcwd()
     nltk.data.path.append(cwd)
     wn16_path = "{0}/dict".format(wordnet16_dir)
     self.wn16 = WordNetCorpusReader(
         os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
         nltk.data.find(wn16_path))
     self.flat_pos = {
         'NN': 'NN',
         'NNS': 'NN',
         'JJ': 'JJ',
         'JJR': 'JJ',
         'JJS': 'JJ',
         'RB': 'RB',
         'RBR': 'RB',
         'RBS': 'RB',
         'VB': 'VB',
         'VBD': 'VB',
         'VBG': 'VB',
         'VBN': 'VB',
         'VBP': 'VB',
         'VBZ': 'VB'
     }
     self.wn_pos = {
         'NN': self.wn16.NOUN,
         'JJ': self.wn16.ADJ,
         'VB': self.wn16.VERB,
         'RB': self.wn16.ADV
     }
     self._load_emotions(wn_domains_dir)
     self.synsets = self._load_synsets(wn_domains_dir)
def main():
    ft = FasttextVectorizer("models/cc.en.300.bin")
    wn2 = WordNetCorpusReader(
        'D:\\dialogue2020\\semevals\\semeval-2016-task-14\\WN1.6', None)
    wn3 = WordNetCorpusReader(
        'D:\\dialogue2020\\semevals\\semeval-2016-task-14\\WN3.0', None)
    input_path = "D:/dialogue2020/semevals/semeval-2016-task-14/reader/"
    vector_path = "models/vectors/fasttext/en/new"

    # vectorize wordnet
    noun_synsets = compute_synsets_from_wordnets(wn2, wn3, 'n')
    verb_synsets = compute_synsets_from_wordnets(wn2, wn3, 'v')
    ft.vectorize_groups(noun_synsets,
                        os.path.join(vector_path,
                                     "nouns_wordnet_fasttext_1.6-3.0.txt"),
                        to_upper=False)
    ft.vectorize_groups(verb_synsets,
                        os.path.join(vector_path,
                                     "verbs_wordnet_fasttext_1.6-3.0.txt"),
                        to_upper=False)

    # vectorize words
    process_data(
        ft, os.path.join(input_path, "no_labels_nouns_en_new.1.6-3.0.tsv"),
        os.path.join(vector_path, "nouns_fasttext_cut_1.6-3.0.txt"))
    process_data(
        ft, os.path.join(input_path, "no_labels_verbs_en_new.1.6-3.0.tsv"),
        os.path.join(vector_path, "verbs_fasttext_cut_1.6-3.0.txt"))
Code Example #3
 def __init__(self):
     super(Antonimos, self).__init__()
     self.nombre = "Antonimos"
     self.descripcion = """
         Mide la cantidad de pares de antónimos presentes en el texto.
     """
     self.thread_safe = False  # Has concurrency issues: https://github.com/nltk/nltk/issues/803
     self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)
Code Example #4
 def __init__(self, wordnet16_dir, wn_domains_dir):
     """Initializes the WordNet-Affect object."""
     
     cwd = os.getcwd()
     nltk.data.path.append(cwd)
     wn16_path = "{0}/dict".format(wordnet16_dir)
     self.wn16 = WordNetCorpusReader(os.path.abspath(wn16_path), nltk.data.find(wn16_path))
     self.flat_pos = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VBG':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
     self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV}
     self._load_emotions(wn_domains_dir)
     self.synsets = self._load_synsets(wn_domains_dir)
Code Example #5
class TestWordNet(unittest.TestCase):
    def setUp(self):
        self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)

    # The offset would need to be updated since it changed, so this test no longer works.
    # def test_invalid_literal_for_int_16(self):
    #     self.wncr._synset_from_pos_and_line('n',
    # '04122387 00 n 0a agudeza 0 broma 0 chiste 0 chufleta 0 comentario' \
    #                                         + '_burlón 0 cuchufleta 0 idea 0 ocurrencia 0 pulla 0 salida 0 04 @' \
    #                                         + ' 04120601 n 0000 + 00620096 v 0000 + 00499330 v 0000 + 00558467' \
    #                                         + ' v 0000 | comentario ingenioso para hacer reír  \n')

    def test_key_error(self):
        self.wncr.lemma("menor.a.09.menor").antonyms()
Code Example #6
def yield_17_candidates(corpus):
    wn16 = WordNetCorpusReader('wordnet/1.6/', None)
    wn17 = WordNetCorpusReader('wordnet/1.7.1/', None)

    for w in corpus.get_unique_words():
        synsets17 = wn17.synsets(w)
        lexclasses = list(set([s.lexname() for s in synsets17]))
        synsets16 = wn16.synsets(w)
        if synsets16:
            continue
        if len(lexclasses) != 1:
            continue
        if 'noun' not in lexclasses[0]:
            continue
        yield w
Code Example #7
    def activate(self, *args, **kwargs):
        
        nltk.download('stopwords')
        self._stopwords = stopwords.words('english')
        #local_path=os.path.dirname(os.path.abspath(__file__))
        self._categories = {'anger': ['general-dislike',],
                            'fear': ['negative-fear',],
                            'disgust': ['shame',],
                            'joy': ['gratitude','affective','enthusiasm','love','joy','liking'],
                            'sadness': ['ingrattitude','daze','humility','compassion','despair','anxiety','sadness']}

        self._wnaffect_mappings = {'anger': 'anger',
                                   'fear': 'negative-fear',
                                   'disgust': 'disgust',
                                   'joy': 'joy',
                                   'sadness': 'sadness'}


        self._load_emotions(self.hierarchy_path)
                
        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(self.synsets_path)
            self.sh['total_synsets'] = total_synsets
        
        self._total_synsets = self.sh['total_synsets']
        
        if 'wn16' not in self.sh:
            self._wn16_path = self.wn16_path
            wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path))
            self.sh['wn16'] = wn16
        
        self._wn16 = self.sh['wn16']
Code Example #8
class Antonimos(Feature):
    def __init__(self):
        super(Antonimos, self).__init__()
        self.nombre = "Antonimos"
        self.descripcion = """
            Mide la cantidad de pares de antónimos presentes en el texto.
        """
        self.thread_safe = False  # Has concurrency issues: https://github.com/nltk/nltk/issues/803
        self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)

    def calcular_feature(self, tweet):
        oraciones = Freeling.procesar_texto(remover_hashtags(remover_usuarios(tweet.texto)))
        tokens = Freeling.get_tokens_de_oraciones(oraciones)

        cant_antonimos = 0

        for token in tokens:
            antonimos = []
            for synset in self.wncr.synsets(token.lemma):
                for lemma in synset.lemmas():
                    antonimos += [lemma_antonimo.name() for lemma_antonimo in lemma.antonyms()]

            for otro_token in tokens:
                if otro_token.lemma in antonimos:
                    cant_antonimos += 1
                    break

        if len(tokens) == 0:
            return 0
        else:
            return cant_antonimos / math.sqrt(len(tokens)) / 2.0  # divide by 2 to count each pair once
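For instance, under this normalization a tweet of 9 tokens in which 3 tokens find an antonym among the other tokens scores 3 / sqrt(9) / 2.0 = 0.5.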
Code Example #9
def main():
    # python fasttext_vectorize_en.py models/cc.en.300.bin ../../datasets/WNs 2.0 models/vectors/fasttext/en ../../datasets/en

    if len(sys.argv) < 7:
        raise Exception(
            "Required arguments: <fasttext-path> <wn-dir> <old-version> <new-version> <vector-path> <input-path>"
        )

    ft = FasttextVectorizer(sys.argv[1])
    old = sys.argv[3]
    new = sys.argv[4]
    wn2 = WordNetCorpusReader(os.path.join(sys.argv[2], "WN" + old), None)
    vector_path = sys.argv[5]
    if not os.path.exists(vector_path):
        os.makedirs(vector_path)
    data_path = sys.argv[6]

    for pos in ['nouns', 'verbs']:
        synsets = compute_synsets_from_wordnets(wn2, pos[0])
        ft.vectorize_groups(
            synsets,
            os.path.join(vector_path,
                         f"{pos}_wordnet_fasttext_{old}-{new}.txt"), False)
        process_data(
            ft, os.path.join(data_path, f"no_labels_{pos}_en.{old}-{new}.tsv"),
            os.path.join(vector_path, f"{pos}_fasttext_{old}-{new}.txt"))
Code Example #10
def generate_taxonomy_fns(params, model):
    # for English WordNet
    if params['language'] == 'en':
        wn = WordNetCorpusReader(params["wordnet_path"], None)
        return lambda x: [hypernym.name() for hypernym in wn.synset(x).hypernyms()
                          if hypernym.name() in model.w2v_synsets.vocab], \
               lambda x: [hyponym.name() for hyponym in wn.synset(x).hyponyms() if hyponym.name()
                          in model.w2v_synsets.vocab], \
               lambda x: x.split(".")[0].replace("_", " ")
    # for RuWordNet
    elif params['language'] == 'ru':
        ruwordnet = RuWordnet(db_path=params["db_path"],
                              ruwordnet_path=params["wordnet_path"])
        return lambda x: ruwordnet.get_hypernyms_by_id(x), lambda x: ruwordnet.get_hyponyms_by_id(x), \
               lambda x: ruwordnet.get_name_by_id(x)
    else:
        raise Exception("task / language is not supported")
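A minimal consumption sketch for the three callables returned above (hedged: the params dict and model object are illustrative stand-ins; model must expose w2v_synsets.vocab, as the lambdas assume):

    get_hypernyms, get_hyponyms, get_name = generate_taxonomy_fns(
        {"language": "en", "wordnet_path": "/path/to/wn/dict"}, model)
    print(get_hypernyms("dog.n.01"))  # hypernym synset names kept in the synset vocabulary
    print(get_name("dog.n.01"))       # -> "dog"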
Code Example #11
def main():
    args = parse_args()
    ft_vec = FasttextVectorizer(args.fasttext_path)

    if args.data_path:
        # read data
        with open(args.data_path, 'r', encoding='utf-8') as f:
            dataset = [
                line.split("\t")[1].replace(" ", "_")
                for line in f.read().split("\n") if line
            ]

        # vectorize wordnet
        if "wordnet" in args:
            wn = WordNetCorpusReader(args.wordnet, None)
            for word in dataset:
                print(word, wn.synsets(word, pos=args.pos))
        else:
            ft_vec.vectorize_multiword_data(dataset,
                                            args.output_path,
                                            to_upper=False)

    elif args.data_dir:
        for system_dir in os.listdir(args.data_dir):
            for dirpath, _, filenames in os.walk(
                    os.path.join(args.data_dir, system_dir, args.language)):
                for filename in filenames:
                    if filename.endswith(".terms"):
                        input_path = os.path.join(dirpath, filename)
                        os.makedirs(os.path.join(args.output_path, system_dir),
                                    exist_ok=True)
                        output_path = os.path.join(
                            args.output_path, system_dir,
                            filename.replace(".terms", ".txt").replace(
                                system_dir + "_", ""))
                        with open(input_path, 'r', encoding='utf-8') as f:
                            dataset = [
                                line.split("\t")[1].replace(" ", "_")
                                for line in f.read().split("\n") if line
                            ]
                        ft_vec.vectorize_multiword_data(dataset,
                                                        output_path,
                                                        to_upper=False)
                        print(f"Processed: {filename}")
    else:
        raise Exception("Please, specify either --data_dir or --data_path")
Code Example #12
    def activate(self, *args, **kwargs):

        nltk.download(['stopwords', 'averaged_perceptron_tagger', 'wordnet'])
        self._stopwords = stopwords.words('english')
        self._wnlemma = wordnet.WordNetLemmatizer()
        self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
        local_path = os.path.dirname(os.path.abspath(__file__))
        self._categories = {
            'anger': [
                'general-dislike',
            ],
            'fear': [
                'negative-fear',
            ],
            'disgust': [
                'shame',
            ],
            'joy':
            ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
            'sadness': [
                'ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                'anxiety', 'sadness'
            ]
        }

        self._wnaffect_mappings = {
            'anger': 'anger',
            'fear': 'negative-fear',
            'disgust': 'disgust',
            'joy': 'joy',
            'sadness': 'sadness'
        }

        self._load_emotions(local_path + self.hierarchy_path)

        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(local_path + self.synsets_path)
            self.sh['total_synsets'] = total_synsets

        self._total_synsets = self.sh['total_synsets']

        self._wn16_path = self.wn16_path
        self._wn16 = WordNetCorpusReader(
            os.path.abspath("{0}".format(local_path + self._wn16_path)),
            nltk.data.find(local_path + self._wn16_path))
Code Example #13
    def activate(self, *args, **kwargs):

        self._stopwords = stopwords.words('english')
        self._wnlemma = wordnet.WordNetLemmatizer()
        self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
        local_path = os.environ.get("SENPY_DATA")
        self._categories = {
            'anger': [
                'general-dislike',
            ],
            'fear': [
                'negative-fear',
            ],
            'disgust': [
                'shame',
            ],
            'joy':
            ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
            'sadness': [
                'ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                'anxiety', 'sadness'
            ]
        }

        self._wnaffect_mappings = {
            'anger': 'anger',
            'fear': 'negative-fear',
            'disgust': 'disgust',
            'joy': 'joy',
            'sadness': 'sadness'
        }

        self._load_emotions(self.find_file(self.hierarchy_path))

        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(
                self.find_file(self.synsets_path))
            self.sh['total_synsets'] = total_synsets

        self._total_synsets = self.sh['total_synsets']

        self._wn16_path = self.wn16_path
        self._wn16 = WordNetCorpusReader(
            self.find_file(self._wn16_path),
            nltk.data.find(self.find_file(self._wn16_path)))
Code Example #14
def run_pos_parallel(specs, job_length):
    def sentiment_chunk(specs, start, finish):
        positive = [
            tuple(line.split(","))
            for line in open(specs["positive_filename"]).read().splitlines()
        ]
        negative = [
            tuple(line.split(","))
            for line in open(specs["negative_filename"]).read().splitlines()
        ]
        neutral = [
            tuple(line.split(","))
            for line in open(specs["neutral_filename"]).read().splitlines()
        ]
        lex = sentimentlexicon.SentimentLexicon(positive,
                                                negative,
                                                neutral,
                                                specs["pos"],
                                                start=start,
                                                finish=finish,
                                                weight=0.2)
        sentiment = lex.iterate()
        output_filename = "wn.%s.%s%s-%s.yaml" % (specs["classification"],
                                                  specs["pos"], start, finish)
        yaml.dump(sentiment, open(output_filename, "w"))
        return sentiment

    # Get the overall size, lc.
    synsets = list(WordNetCorpusReader(wn_root, None).all_synsets(pos=specs["pos"]))
    lc = {}
    for synset in synsets:
        for lemma in synset.lemmas():
            lc[lemma] = True
    lc = len(lc.keys())
    print("Lemma count", lc)
    # Build the jobs.
    start = 0
    finish = job_length
    jobs = []
    ppservers = ()
    job_server = pp.Server(ppservers=ppservers)
    while start < lc:
        jobs.append(
            job_server.submit(sentiment_chunk, (specs, start, finish), (),
                              ("sentimentlexicon", "yaml")))
        start = finish
        finish += job_length
        if finish > lc: finish = lc
    print(len(jobs), "jobs created; now running ...")
    all_sentiment = {}
    for job in jobs:
        sentiment = job()
        for key, val in sentiment.items():
            all_sentiment[key] = val
    job_server.print_stats()
    output_filename = "wn.%s.%s.yaml" % (specs["classification"], specs["pos"])
    yaml.dump(all_sentiment, open(output_filename, "w"))
Code Example #15
 def __init__(self, wordnet_version='3.5'):
     self.wordnet_version = wordnet_version
     if self.wordnet_version == '3.5':
         self.wn = wn
     else:
         nltk.data.path.append(ROOT_PATH)
         wn_dir = "wordnet/resources/WordNet-" + self.wordnet_version + '/'
         wn_path = "{0}/dict".format(wn_dir)
         self.wn = WordNetCorpusReader(
             os.path.abspath("{0}/{1}".format(ROOT_PATH, wn_path)),
             nltk.data.find(wn_path))
Code Example #16
 def __init__(self, wordnet16_dir, wn_domains_dir):
     """Initializes the WordNet-Affect object."""
     
     cwd = os.getcwd()
     nltk.data.path.append(cwd)
     wn16_path = "{0}/dict".format(wordnet16_dir)
     self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path))
     self.flat_pos = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VBG':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
     self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV}
     self._load_emotions(wn_domains_dir)
     self.synsets = self._load_synsets(wn_domains_dir)
Code Example #17
    def __init__(self, wordnet_home):
        assert exists(f'{wordnet_home}/WordNet-2.0'
                      ), f'error: missing WordNet-2.0 in {wordnet_home}'
        assert exists(f'{wordnet_home}/wn-domains-3.2'
                      ), f'error: missing WordNetDomains in {wordnet_home}'

        # load WordNet2.0
        self.wn = WordNetCorpusReader(f'{wordnet_home}/WordNet-2.0/dict',
                                      'WordNet-2.0/dict')

        # load WordNetDomains (based on https://stackoverflow.com/a/21904027/8759307)
        self.domain2synsets = defaultdict(list)
        self.synset2domains = defaultdict(list)
        for i in open(f'{wordnet_home}/wn-domains-3.2/wn-domains-3.2-20070223',
                      'r'):
            ssid, doms = i.strip().split('\t')
            doms = doms.split()
            self.synset2domains[ssid] = doms
            for d in doms:
                self.domain2synsets[d].append(ssid)
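A brief usage sketch for the loader above (hedged: WordNetDomains is a hypothetical name for the enclosing class; wn-domains-3.2 keys synsets as zero-padded "offset-pos" strings, e.g. "00001740-n"):

    wnd = WordNetDomains('/path/to/wordnet_home')      # hypothetical class name
    s = wnd.wn.synsets('bank')[0]
    ssid = '%08d-%s' % (s.offset(), s.pos())           # rebuild the "offset-pos" key
    print(wnd.synset2domains.get(ssid, []))            # domains annotated for this synset
    print(len(wnd.domain2synsets.get('economy', [])))  # synsets carrying a given domain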
Code Example #18
class TestTransform(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.languages = ["cat", "eng", "eus", "glg", "spa"]
        cls.wn_names = {}
        for lang in cls.languages:
            cls.wn_names[lang] = '.wordnet_' + lang
            with tarfile.open('wordnet_' + lang + '.tar.gz') as f:
                f.extractall(cls.wn_names[lang])

    def test_all_synsets(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.all_synsets():
            a = synset
        # success if there is no error
        # This will also test that all synsets in data files are in index files.

    def test_invalid_literal_for_int_16(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        for synset in self.wncr.synsets("agudeza"):
            a = synset


#        self.wncr._synset_from_pos_and_line('n',
#                                            "04122387 00 n 0a agudeza 0 broma 0 chiste 0 chufleta 0 comentario_burlón 0 cuchufleta 0 idea 0 ocurrencia 0 pulla 0 salida 0 04 @ 04120601 n 0000 + 00620096 v 0000 + 00499330 v 0000 + 00558467 v 0000 | comentario ingenioso para hacer reír  \n")
#        # success if there is no error

    def test_key_error(self):
        self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
        self.wncr.lemma("menor.a.09.menor").antonyms()
        # success if there is no error

    def test_load_wordnet(self):
        for lang in self.languages:
            self.wncr = WordNetCorpusReader(self.wn_names[lang], None)
            # success if there is no error

    @classmethod
    def tearDownClass(cls):
        for lang in cls.languages:
            shutil.rmtree(cls.wn_names[lang])
Code Example #19
def main():
    # python en_dataset_creation.py ../../datasets/WNs ../../datasets/en/ 2.0 3.0

    if len(sys.argv) < 4:
        raise Exception(
            "The following arguments are required: <WordNet path> <output_path> <old_version_float> <new_version_float>"
        )

    path = sys.argv[1]
    out_path = sys.argv[2]
    old_version = sys.argv[3]

    if len(sys.argv) == 5:
        new_version = sys.argv[4]
    else:
        new_version = "3.0"

    wn2 = WordNetCorpusReader(os.path.join(path, 'WN' + old_version), None)
    wn3 = WordNetCorpusReader(os.path.join(path, 'WN' + new_version), None)

    for pos in ['nouns', 'verbs']:
        synsets_2n = set(wn2.all_synsets(pos[0]))
        synsets_3n = set(wn3.all_synsets(pos[0]))

        reference_nouns = synsets_3n.intersection(synsets_2n)
        new = extract_new_lemmas(synsets_3n.difference(synsets_2n), wn2,
                                 pos[0])
        hypernyms = generate_gold(new, wn3, reference_nouns, pos[0])

        print(f"Len {pos} {len(hypernyms)}")
        save(dict(hypernyms), out_path,
             f"{pos}_en.{old_version}-{new_version}.tsv")
Code Example #21
 def initialize_s(self):
     """Builds the vectors s, as a dictionary mapping words to
     reals. The domain of the dictionary is the full vocabulary."""
     synsets = list(WordNetCorpusReader(wn_root, None).all_synsets(pos=self.pos))
     for synset in synsets:
         for lemma in synset.lemmas():
             if (lemma.name(), synset.pos()) in self.positive:
                 self.s[lemma] = 1.0
                 self.s0[lemma] = 1.0
             elif (lemma.name(), synset.pos()) in self.negative:
                 self.s[lemma] = -1.0
                 self.s0[lemma] = -1.0
             else:
                 self.s[lemma] = 0.0
                 self.s0[lemma] = 0.0
Code Example #22
 def __init__(self, wordnet16_dir, wn_domains_dir):
     """Initializes the WordNet-Affect object."""
     try:
         cwd = os.getcwd()
         nltk.data.path.append(cwd)
         wn16_path = "{0}/dict".format(wordnet16_dir)
         self.wn16 = WordNetCorpusReader(
             os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
             nltk.data.find(wn16_path))
         self.flat_pos = {
             'NN': 'NN',
             'NNS': 'NN',
             'JJ': 'JJ',
             'JJR': 'JJ',
             'JJS': 'JJ',
             'RB': 'RB',
             'RBR': 'RB',
             'RBS': 'RB',
             'VB': 'VB',
             'VBD': 'VB',
             'VBG': 'VB',
             'VBN': 'VB',
             'VBP': 'VB',
             'VBZ': 'VB'
         }
         self.wn_pos = {
             'NN': self.wn16.NOUN,
             'JJ': self.wn16.ADJ,
             'VB': self.wn16.VERB,
             'RB': self.wn16.ADV
         }
         self._load_emotions(wn_domains_dir)
         self.synsets = self._load_synsets(wn_domains_dir)
     except Exception:
         print("Please download the dependencies and re-run the script after installing them successfully. Exiting!")
         exit()
Code Example #23
    def __init__(self, info, *args, **kwargs):
        super(EmotionTextPlugin, self).__init__(info, *args, **kwargs)
        self.id = info['module']
        self.info = info
        self._stopwords = stopwords.words('english')
        local_path=os.path.dirname(os.path.abspath(__file__))
        self._categories = {'anger': ['general-dislike',],
                            'fear': ['negative-fear',],
                            'disgust': ['shame',],
                            'joy': ['gratitude','affective','enthusiasm','love','joy','liking'],
                            'sadness': ['ingrattitude','daze','humility','compassion','despair','anxiety','sadness']}

        self._wnaffect_mappings = {'anger': 'anger',
                                   'fear': 'negative-fear',
                                   'disgust': 'disgust',
                                   'joy': 'joy',
                                   'sadness': 'sadness'}

        self._load_emotions(local_path+self.info['hierarchy_path'])     
        self._total_synsets = self._load_synsets(local_path+self.info['synsets_path'])
        self._wn16_path = local_path+self.info['wn16_path']
        self._wn16= None
        self._wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path))
Code Example #24
    output_path = os.path.join(exp_folder, 'wsd_framework_results.json')
    with open(output_path, 'w') as outfile:
        json.dump(results, outfile)



if __name__ == '__main__':
    #exp_folder = 'coling2018/synset-se13-semcor'
    exp_folder = 'coling2018/synset-se2-framework-semcor'
    scorer_folder = '/Users/marten/Downloads/WSD_Unified_Evaluation_Datasets'

    from nltk.corpus import WordNetCorpusReader

    if any(['se13' in exp_folder,
            'framework' in exp_folder]):
        from nltk.corpus import wordnet as wn
    elif 'se2' in exp_folder:
        path_to_wn_dict_folder = '/Users/marten/Downloads/WordNet-1.7.1/dict'
        wn = WordNetCorpusReader(path_to_wn_dict_folder, None)

    create_key_file(wn, exp_folder, debug=1)
    score_using_official_scorer(exp_folder, scorer_folder)
Code Example #25
 def test_load_wordnet(self):
     for lang in self.languages:
         self.wncr = WordNetCorpusReader(self.wn_names[lang], None)
Code Example #26
#!/usr/bin/env python

import sys
from nltk.corpus import WordNetCorpusReader

dict_dir = sys.argv[1]

wn = WordNetCorpusReader(dict_dir, None)

for synset in wn.all_synsets():
    for lem in synset.lemmas():
        print(lem.name(), synset.lexname())
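Invocation sketch (hypothetical script name; the argument is any WordNet "dict" directory):

    python dump_lexnames.py /path/to/WordNet-1.6/dict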


Code Example #27
from nltk.corpus import WordNetCorpusReader
from nltk.corpus import wordnet as wn

import csv  # import the csv module
import pandas as pd

from scipy.stats import spearmanr

import re
import os
import pickle

import nltk
from gensim.models import word2vec  # old gensim API used below

# load WordNet-1.7.1
cwd = os.getcwd()
nltk.data.path.append(cwd)
wordnet17_dir = "resources/WordNet-1.7.1/"
wn17_path = "{0}/dict".format(wordnet17_dir)
WN17 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn17_path)), nltk.data.find(wn17_path))

if __name__ == '__main__':
    mapping = ['AutoExtend', 'GlossTfIdf', 'GlossAve', 'Word']
    S = {}
    N = 999 #datanum

    # load the pre-trained model
    model = word2vec.Word2Vec.load_word2vec_format("../word2vec/models/GoogleNews-vectors-negative300.bin", binary=True)

    with open('synset2vecAE.pickle', 'rb') as f:
        S['AutoExtend'] = pickle.load(f)

    with open('synset2vecG.pickle', 'rb') as f:
        S['GlossTfIdf'] = pickle.load(f)
Code Example #28
 def test_all_synsets(self):
     self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
     for synset in self.wncr.all_synsets():
         a = synset
Code Example #29
from nltk.corpus import WordNetCorpusReader
from fasttext_vectorize_en import compute_synsets_from_wordnets

wn2 = WordNetCorpusReader(
    'D:\\dialogue2020\\semevals\\semeval-2016-task-14\\WN1.7', None)
wn3 = WordNetCorpusReader('D:\\dialogue2020\\semeval-2016-task-14\\WN3.0',
                          None)
input_path = "D:/dialogue2020/semeval-2016-task-14/reader/"
vector_path = "models/vectors/fasttext/en/"

# vectorize wordnet
noun_synsets = compute_synsets_from_wordnets(wn2, wn3, 'n')
verb_synsets = compute_synsets_from_wordnets(wn2, wn3, 'v')
Code Example #31
 def test_invalid_literal_for_int_16(self):
     self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
     for synset in self.wncr.synsets("agudeza"):
         a = synset
Code Example #32
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    bert_vectorizer = BertVectorizer(args.bert_path)

    if 'ruwordnet_path' in args:
        ruwordnet = RuWordnet(args.ruwordnet_path, None)
        synsets = defaultdict(list)
        for sense_id, synset_id, text in ruwordnet.get_all_senses():
            if synset_id.endswith(args.pos):
                synsets[synset_id].append(text.lower())
        bert_vectorizer.vectorize_groups(synsets,
                                         args.output_path,
                                         to_upper=False)

    if 'wordnet_old' in args:
        wn_old = WordNetCorpusReader(args.wordnet_old, None)
        wn_new = WordNetCorpusReader(args.wordnet_new, None)
        synsets = compute_synsets_from_wordnets(wn_old, wn_new, args.pos)
        bert_vectorizer.vectorize_groups(synsets,
                                         args.output_path,
                                         to_upper=False)

    if "data_path" in args:
        data = read_file(args.data_path, lower=args.upper)
        bert_vectorizer.vectorize_data(data,
                                       args.output_path,
                                       upper=args.upper)
Code Example #33
class WNAffect(EmotionPlugin, ShelfMixin):
    '''
    Emotion classifier using WordNet-Affect to calculate the percentage
    of each emotion. This plugin classifies among six classes: anger, fear,
    disgust, joy, sadness, or neutral. The only available language is English (en).
    '''
    name = 'emotion-wnaffect'
    author = ["@icorcuera", "@balkian"]
    version = '0.2'
    extra_params = {
        'language': {
            "@id": 'lang_wnaffect',
            'description': 'language of the input',
            'aliases': ['language', 'l'],
            'required': True,
            'options': [
                'en',
            ]
        }
    }
    synsets_path = "a-synsets.xml"
    hierarchy_path = "a-hierarchy.xml"
    wn16_path = "wordnet1.6/dict"
    onyx__usesEmotionModel = "emoml:big6"
    nltk_resources = ['stopwords', 'averaged_perceptron_tagger', 'wordnet']

    def _load_synsets(self, synsets_path):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
        tree = ET.parse(synsets_path)
        root = tree.getroot()
        pos_map = {"noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB"}

        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(
                    pos, pos)):
                offset = int(elem.get("id")[2:])
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = Emo.emotions[elem.get(
                        "categ")] if elem.get(
                            "categ") in Emo.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(
                        elem.get("noun-id")[2:])]
        return synsets

    def _load_emotions(self, hierarchy_path):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""

        tree = ET.parse(hierarchy_path)
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emo.emotions["root"] = Emo("root")
            else:
                Emo.emotions[name] = Emo(name, elem.get("isa"))

    def activate(self, *args, **kwargs):

        self._stopwords = stopwords.words('english')
        self._wnlemma = wordnet.WordNetLemmatizer()
        self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
        local_path = os.environ.get("SENPY_DATA")
        self._categories = {
            'anger': [
                'general-dislike',
            ],
            'fear': [
                'negative-fear',
            ],
            'disgust': [
                'shame',
            ],
            'joy':
            ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
            'sadness': [
                'ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                'anxiety', 'sadness'
            ]
        }

        self._wnaffect_mappings = {
            'anger': 'anger',
            'fear': 'negative-fear',
            'disgust': 'disgust',
            'joy': 'joy',
            'sadness': 'sadness'
        }

        self._load_emotions(self.find_file(self.hierarchy_path))

        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(
                self.find_file(self.synsets_path))
            self.sh['total_synsets'] = total_synsets

        self._total_synsets = self.sh['total_synsets']

        self._wn16_path = self.wn16_path
        self._wn16 = WordNetCorpusReader(
            self.find_file(self._wn16_path),
            nltk.data.find(self.find_file(self._wn16_path)))

    def deactivate(self, *args, **kwargs):
        self.save()

    def _my_preprocessor(self, text):

        regHttp = re.compile(
            '(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
        regHttps = re.compile(
            '(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
        regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
        text = re.sub(regHttp, '', text)
        text = re.sub(regAt, '', text)
        text = re.sub('RT : ', '', text)
        text = re.sub(regHttps, '', text)
        text = re.sub('[0-9]', '', text)
        text = self._delete_punctuation(text)
        return text

    def _delete_punctuation(self, text):

        exclude = set(string.punctuation)
        s = ''.join(ch for ch in text if ch not in exclude)
        return s

    def _extract_ngrams(self, text):

        unigrams_lemmas = []
        pos_tagged = []
        unigrams_words = []
        tokens = text.split()
        for token in nltk.pos_tag(tokens):
            unigrams_words.append(token[0])
            pos_tagged.append(token[1])
            if token[1][0] in self._syntactics.keys():
                unigrams_lemmas.append(
                    self._wnlemma.lemmatize(token[0],
                                            self._syntactics[token[1][0]]))
            else:
                unigrams_lemmas.append(token[0])

        return unigrams_words, unigrams_lemmas, pos_tagged

    def _find_ngrams(self, input_list, n):
        return zip(*[input_list[i:] for i in range(n)])

    def _clean_pos(self, pos_tagged):

        pos_tags = {
            'NN': 'NN',
            'NNP': 'NN',
            'NNP-LOC': 'NN',
            'NNS': 'NN',
            'JJ': 'JJ',
            'JJR': 'JJ',
            'JJS': 'JJ',
            'RB': 'RB',
            'RBR': 'RB',
            'RBS': 'RB',
            'VB': 'VB',
            'VBD': 'VB',
            'VBG': 'VB',
            'VBN': 'VB',
            'VBP': 'VB',
            'VBZ': 'VB'
        }

        for i in range(len(pos_tagged)):
            if pos_tagged[i] in pos_tags:
                pos_tagged[i] = pos_tags[pos_tagged[i]]
        return pos_tagged

    def _extract_features(self, text):

        feature_set = {k: 0 for k in self._categories}
        ngrams_words, ngrams_lemmas, pos_tagged = self._extract_ngrams(text)
        matches = 0
        pos_tagged = self._clean_pos(pos_tagged)

        tag_wn = {
            'NN': self._wn16.NOUN,
            'JJ': self._wn16.ADJ,
            'VB': self._wn16.VERB,
            'RB': self._wn16.ADV
        }
        for i in range(len(pos_tagged)):
            if pos_tagged[i] in tag_wn:
                synsets = self._wn16.synsets(ngrams_words[i],
                                             tag_wn[pos_tagged[i]])
                if synsets:
                    offset = synsets[0].offset()
                    if offset in self._total_synsets[pos_tagged[i]]:
                        if self._total_synsets[pos_tagged[i]][offset] is None:
                            continue
                        else:
                            emotion = self._total_synsets[
                                pos_tagged[i]][offset].get_level(5).name
                            matches += 1
                            for i in self._categories:
                                if emotion in self._categories[i]:
                                    feature_set[i] += 1
        if matches == 0:
            matches = 1

        for i in feature_set:
            feature_set[i] = (feature_set[i] / matches)

        return feature_set

    def analyse_entry(self, entry, activity):
        params = activity.params

        text_input = entry['nif:isString']

        text = self._my_preprocessor(text_input)

        feature_text = self._extract_features(text)

        emotionSet = EmotionSet(id="Emotions0")
        emotions = emotionSet.onyx__hasEmotion

        for i in feature_text:
            emotions.append(
                Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i],
                        onyx__hasEmotionIntensity=feature_text[i]))

        entry.emotions = [emotionSet]

        yield entry

    def test(self, *args, **kwargs):
        results = list()
        params = {
            'algo': 'emotion-wnaffect',
            'intype': 'direct',
            'expanded-jsonld': 0,
            'informat': 'text',
            'prefix': '',
            'plugin_type': 'analysisPlugin',
            'urischeme': 'RFC5147String',
            'outformat': 'json-ld',
            'i': 'Hello World',
            'input': 'Hello World',
            'conversion': 'full',
            'language': 'en',
            'algorithm': 'emotion-wnaffect'
        }

        self.activate()
        texts = {
            'I hate you': 'anger',
            'i am sad': 'sadness',
            'i am happy with my marks': 'joy',
            'This movie is scary': 'negative-fear'
        }

        for text in texts:
            response = next(
                self.analyse_entry(Entry(nif__isString=text),
                                   self.activity(params)))
            expected = texts[text]
            emotionSet = response.emotions[0]
            max_emotion = max(emotionSet['onyx:hasEmotion'],
                              key=lambda x: x['onyx:hasEmotionIntensity'])
            assert max_emotion['onyx:hasEmotionCategory'] == expected
Code Example #34
import os
import nltk
import xml.etree.ElementTree as ET
from nltk.corpus import WordNetCorpusReader
from sqlalchemy import *
from xml.dom import minidom
from nltk.corpus import wordnet as wn

import difflib
import pickle

# load WordNet-1.6
cwd = os.getcwd()
nltk.data.path.append(cwd)
wordnet16_dir = "resources/wordnet-1.6/"
wn16_path = "{0}/dict".format(wordnet16_dir)
WN16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
                           nltk.data.find(wn16_path))


# load Wordnet-Affect synsets
# corpus: a-synset.xml
# return: {
#   'noun': {
#     '05586574': { 'categ': 'electricity', 'pos': 'noun', 'offset16': '05586574' }
#   }, ...
# }
def load_asynsets(corpus):
    tree = ET.parse(corpus)
    root = tree.getroot()

    asynsets = {}
    for pos in ["noun", "adj", "verb", "adv"]:
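        # -- The example is truncated here in the source listing. --
        # A hedged completion sketch, inferred from the documented return
        # shape above (an assumption, not the original code):
        asynsets[pos] = {}
        for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos)):
            offset = elem.get("id")[2:]
            if not offset:
                continue
            asynsets[pos][offset] = {"offset16": offset, "pos": pos}
            if elem.get("categ"):
                asynsets[pos][offset]["categ"] = elem.get("categ")
    return asynsets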
Code Example #35
 def test_key_error(self):
     self.wncr = WordNetCorpusReader(self.wn_names['spa'], None)
     self.wncr.lemma("menor.a.09.menor").antonyms()
Code Example #36
class WNAffect:
    """WordNet-Affect ressource."""
    
    def __init__(self, wordnet16_dir, wn_domains_dir):
        """Initializes the WordNet-Affect object."""
        
        cwd = os.getcwd()
        nltk.data.path.append(cwd)
        wn16_path = "{0}/dict".format(wordnet16_dir)
        self.wn16 = WordNetCorpusReader(os.path.abspath("{0}/{1}".format(cwd, wn16_path)), nltk.data.find(wn16_path))
        self.flat_pos = {'NN':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB', 'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VBG':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}
        self.wn_pos = {'NN':self.wn16.NOUN, 'JJ':self.wn16.ADJ, 'VB':self.wn16.VERB, 'RB':self.wn16.ADV}
        self._load_emotions(wn_domains_dir)
        self.synsets = self._load_synsets(wn_domains_dir)
        


    def _load_synsets(self, wn_domains_dir):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
        
        tree = ET.parse("{0}/a-synsets.xml".format(wn_domains_dir))
        root = tree.getroot()
        pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" }
    
        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos, pos)):
                offset = int(elem.get("id")[2:])                
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = Emotion.emotions[elem.get("categ")] if elem.get("categ") in Emotion.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])]
    
        return synsets
        
    def _load_emotions(self, wn_domains_dir):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""
        
        tree = ET.parse("{0}/a-hierarchy.xml".format(wn_domains_dir))
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emotion.emotions["root"] = Emotion("root")
            else:
                Emotion.emotions[name] = Emotion(name, elem.get("isa"))
    
    def get_emotion(self, word, pos):
        """Returns the emotion of the word.
            word -- the word (str)
            pos -- part-of-speech (str)
        """
        
        if pos in self.flat_pos:
            pos = self.flat_pos[pos]
            synsets = self.wn16.synsets(word, self.wn_pos[pos])         
            if synsets:
                offset = synsets[0].offset()
                if offset in self.synsets[pos]:
                    return self.synsets[pos][offset]
        return None
Code Example #37
def yield_single_sense_nouns_in_corpus(corpus):
    wn = WordNetCorpusReader('wordnet/1.6/', None)
    for word in corpus.get_unique_words():
        synsets = list(wn.synsets(word))
        if len(synsets) == 1 and 'noun' in synsets[0].lexname():
            yield word
Code Example #39
def yield_single_supersense_nouns_in_corpus(corpus):
    wn = WordNetCorpusReader('wordnet/1.6/', None)
    for w in corpus.get_unique_words():
        lexclasses = list(set([s.lexname() for s in wn.synsets(w)]))
        if len(lexclasses) == 1 and 'noun' in lexclasses[0]:
            yield w
Code Example #41
class WNAffect(object):
    """WordNet-Affect resource."""
    def __init__(self, wordnet16_dir=None, wn_domains_dir=None):
        """Initializes the WordNet-Affect object."""
        wordnet16_dir = wordnet16_dir or join(dirname(__file__), "wordnet-1.6")
        wn_domains_dir = wn_domains_dir or join(dirname(__file__),
                                                "wn-domains-3.2")
        cwd = os.getcwd()
        nltk.data.path.append(cwd)
        wn16_path = "{0}/dict".format(wordnet16_dir)
        self.wn16 = WordNetCorpusReader(
            os.path.abspath("{0}/{1}".format(cwd, wn16_path)),
            nltk.data.find(wn16_path))
        self.flat_pos = {
            'NN': 'NN',
            'NNS': 'NN',
            'JJ': 'JJ',
            'JJR': 'JJ',
            'JJS': 'JJ',
            'RB': 'RB',
            'RBR': 'RB',
            'RBS': 'RB',
            'VB': 'VB',
            'VBD': 'VB',
            'VBG': 'VB',
            'VBN': 'VB',
            'VBP': 'VB',
            'VBZ': 'VB'
        }
        self.wn_pos = {
            'NN': self.wn16.NOUN,
            'JJ': self.wn16.ADJ,
            'VB': self.wn16.VERB,
            'RB': self.wn16.ADV
        }
        self._load_emotions(wn_domains_dir)
        self.synsets = self._load_synsets(wn_domains_dir)

    def _load_synsets(self, wn_domains_dir):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""

        tree = ET.parse(
            "{0}/wn-affect-1.1/a-synsets.xml".format(wn_domains_dir))
        root = tree.getroot()
        pos_map = {"noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB"}

        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(
                    pos, pos)):
                offset = int(elem.get("id")[2:])
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = WNAffectEmotion.emotions[elem.get(
                        "categ")] if elem.get(
                            "categ") in WNAffectEmotion.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(
                        elem.get("noun-id")[2:])]

        return synsets

    def _load_emotions(self, wn_domains_dir):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""

        tree = ET.parse(
            "{0}/wn-affect-1.1/a-hierarchy.xml".format(wn_domains_dir))
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                WNAffectEmotion.emotions["root"] = WNAffectEmotion("root")
            else:
                WNAffectEmotion.emotions[name] = WNAffectEmotion(
                    name, elem.get("isa"))

    def get_emotion(self, word, pos):
        """Returns the emotion of the word.
            word -- the word (str)
            pos -- part-of-speech (str)
        """

        if pos in self.flat_pos:
            pos = self.flat_pos[pos]
            synsets = self.wn16.synsets(word, self.wn_pos[pos])
            if synsets:
                for synset in synsets:
                    offset = synset.offset()
                    if offset in self.synsets[pos]:
                        return self.synsets[pos][offset]
        return None

    def get_emotion_synset(self, offset):
        """Returns the emotion of the synset.
            offset -- synset offset (int)
        """

        for pos in self.flat_pos.values():
            if offset in self.synsets[pos]:
                return self.synsets[pos][offset]
        return None
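A short usage sketch for the class above (paths are placeholders; assumes wordnet-1.6 and wn-domains-3.2 are unpacked next to the module, as the constructor defaults expect):

    wna = WNAffect()                      # or WNAffect("wordnet-1.6", "wn-domains-3.2")
    emo = wna.get_emotion("angry", "JJ")  # look up by word and Penn POS tag
    if emo is not None:
        print(emo.name)                   # assumption: WNAffectEmotion stores its category name as .name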
Code Example #43
File: testwordnet.py  Project: bryant1410/pghumor
 def setUp(self):
     self.wncr = WordNetCorpusReader(resource_filename('clasificador.recursos', 'wordnet_spa'), None)
Code Example #44
class EmotionTextPlugin(EmotionPlugin):
    
    def __init__(self, info, *args, **kwargs):
        super(EmotionTextPlugin, self).__init__(info, *args, **kwargs)
        self.id = info['module']
        self.info = info
        self._stopwords = stopwords.words('english')
        local_path=os.path.dirname(os.path.abspath(__file__))
        self._categories = {'anger': ['general-dislike',],
                            'fear': ['negative-fear',],
                            'disgust': ['shame',],
                            'joy': ['gratitude','affective','enthusiasm','love','joy','liking'],
                            'sadness': ['ingrattitude','daze','humility','compassion','despair','anxiety','sadness']}

        self._wnaffect_mappings = {'anger': 'anger',
                                   'fear': 'negative-fear',
                                   'disgust': 'disgust',
                                   'joy': 'joy',
                                   'sadness': 'sadness'}

        self._load_emotions(local_path+self.info['hierarchy_path'])     
        self._total_synsets = self._load_synsets(local_path+self.info['synsets_path'])
        self._wn16_path = local_path+self.info['wn16_path']
        self._wn16= None
        self._wn16 = WordNetCorpusReader(os.path.abspath("{0}".format(self._wn16_path)), nltk.data.find(self._wn16_path))
        

    def _load_synsets(self, synsets_path):
        """Returns a dictionary POS tag -> synset offset -> emotion (str -> int -> str)."""
        tree = ET.parse(synsets_path)
        root = tree.getroot()
        pos_map = { "noun": "NN", "adj": "JJ", "verb": "VB", "adv": "RB" }

        synsets = {}
        for pos in ["noun", "adj", "verb", "adv"]:
            tag = pos_map[pos]
            synsets[tag] = {}
            for elem in root.findall(".//{0}-syn-list//{0}-syn".format(pos, pos)):
                offset = int(elem.get("id")[2:])                
                if not offset: continue
                if elem.get("categ"):
                    synsets[tag][offset] = Emo.emotions[elem.get("categ")] if elem.get("categ") in Emo.emotions else None
                elif elem.get("noun-id"):
                    synsets[tag][offset] = synsets[pos_map["noun"]][int(elem.get("noun-id")[2:])]
        return synsets

    def _load_emotions(self, hierarchy_path):
        """Loads the hierarchy of emotions from the WordNet-Affect xml."""

        tree = ET.parse(hierarchy_path)
        root = tree.getroot()
        for elem in root.findall("categ"):
            name = elem.get("name")
            if name == "root":
                Emo.emotions["root"] = Emo("root")
            else:
                Emo.emotions[name] = Emo(name, elem.get("isa"))

    def activate(self, *args, **kwargs):
        logger.info("EmoText plugin is ready to go!")

    def deactivate(self, *args, **kwargs):

        logger.info("EmoText plugin is being deactivated...")

    def _my_preprocessor(self, text):

        regHttp = re.compile('(http://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
        regHttps = re.compile('(https://)[a-zA-Z0-9]*.[a-zA-Z0-9/]*(.[a-zA-Z0-9]*)?')
        regAt = re.compile('@([a-zA-Z0-9]*[*_/&%#@$]*)*[a-zA-Z0-9]*')
        text = re.sub(regHttp, '', text)
        text = re.sub(regAt, '', text)
        text = re.sub('RT : ', '', text)
        text = re.sub(regHttps, '', text)
        text = re.sub('[0-9]', '', text)
        text = self._delete_punctuation(text)
        return text

    def _delete_punctuation(self, text):

        exclude = set(string.punctuation)
        s = ''.join(ch for ch in text if ch not in exclude)
        return s

    def _extract_ngrams(self, text):

        unigrams_lemmas = []
        pos_tagged = []
        unigrams_words = []
        sentences = parse(text,lemmata=True).split()
        for sentence in sentences:
            for token in sentence:
                if token[0].lower() not in self._stopwords:
                    unigrams_words.append(token[0].lower())
                    unigrams_lemmas.append(token[4])  
                    pos_tagged.append(token[1])        

        return unigrams_words,unigrams_lemmas,pos_tagged

    def _find_ngrams(self, input_list, n):
        return zip(*[input_list[i:] for i in range(n)])

    def _clean_pos(self, pos_tagged):

        pos_tags={'NN':'NN', 'NNP':'NN','NNP-LOC':'NN', 'NNS':'NN', 'JJ':'JJ', 'JJR':'JJ', 'JJS':'JJ', 'RB':'RB', 'RBR':'RB',
        'RBS':'RB', 'VB':'VB', 'VBD':'VB', 'VBG':'VB', 'VBN':'VB', 'VBP':'VB', 'VBZ':'VB'}

        for i in range(len(pos_tagged)):
            if pos_tagged[i] in pos_tags:
                pos_tagged[i]=pos_tags[pos_tagged[i]]
        return pos_tagged
    
    def _extract_features(self, text):

        feature_set={k:0 for k in self._categories}
        ngrams_words,ngrams_lemmas,pos_tagged = self._extract_ngrams(text)
        matches=0
        pos_tagged=self._clean_pos(pos_tagged)

        tag_wn={'NN':self._wn16.NOUN,'JJ':self._wn16.ADJ,'VB':self._wn16.VERB,'RB':self._wn16.ADV}
        for i in range(len(pos_tagged)):
            if pos_tagged[i] in tag_wn:
                synsets = self._wn16.synsets(ngrams_words[i], tag_wn[pos_tagged[i]])   
                if synsets:
                    offset = synsets[0].offset()
                    if offset in self._total_synsets[pos_tagged[i]]:
                        if self._total_synsets[pos_tagged[i]][offset] is None:
                            continue
                        else:
                            emotion = self._total_synsets[pos_tagged[i]][offset].get_level(5).name
                            matches+=1
                            for i in self._categories:
                                if emotion in self._categories[i]:
                                    feature_set[i]+=1
        if matches == 0:
            matches=1                

        for i in feature_set:
            feature_set[i] = (feature_set[i]/matches)*100

        return feature_set

    def analyse(self, **params):

        logger.debug("Analysing with params {}".format(params))

        text_input = params.get("input", None)

        text=self._my_preprocessor(text_input)

        feature_text=self._extract_features(text)

        response = Results()

        entry = Entry(id="Entry",
                      text=text_input)
        emotionSet = EmotionSet(id="Emotions0")
        emotions = emotionSet.onyx__hasEmotion

        for i in feature_text:
            emotions.append(Emotion(onyx__hasEmotionCategory=self._wnaffect_mappings[i],
                                    onyx__hasEmotionIntensity=feature_text[i]))

        entry.emotions = [emotionSet]
        response.entries.append(entry)
        return response