Example #1
    def _tweet2wseq(self, msg):
        """Convert tweet to a sequence of word lemmas if these words are informative.

        Args:
          msg (cgsa.data.Tweet): input message

        Returns:
          list: lemmas of informative words

        """
        return [normlex(w.lemma) for w in msg if is_relevant(w.form)]
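A minimal sketch of how `_tweet2wseq` might be exercised, with hypothetical stand-ins for `normlex`, `is_relevant`, and the token type (the real ones live in the cgsa package; the behaviors assumed here are only illustrative):

    from collections import namedtuple

    Word = namedtuple("Word", ["form", "lemma"])

    def normlex(lemma):
        # assumed normalization: lowercase the lemma
        return lemma.lower()

    def is_relevant(form):
        # assumed filter: keep only alphabetic tokens
        return form.isalpha()

    msg = [Word("Cats", "cat"), Word("!", "!"), Word("purr", "purr")]
    print([normlex(w.lemma) for w in msg if is_relevant(w.form)])
    # -> ['cat', 'purr']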
Example #2
    def _get_test_w_emb_i(self, a_word):
        """Obtain embedding index for the given word.

        Args:
          a_word (str):
            word whose embedding index should be retrieved

        Returns:
          int:
            embedding index of the given word

        """
        a_word = normlex(a_word)
        return self._w2i.get(a_word, UNK_IDX)
Example #3
    def get_lex_emb_i(self, a_word):
        """Obtain lexicon embedding ind for the given word.

        Args:
          a_word (str):
            word whose embedding index should be retrieved

        Returns:
          int: embedding index of the given word

        """
        a_word = normlex(a_word)
        if a_word in self._w2lex_i:
            return self._w2lex_i[a_word]
        return UNK_IDX
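Examples #2 and #3 implement the same fallback lookup; `dict.get` with a default, as in Example #2, is the more idiomatic form. A self-contained illustration with toy values:

    UNK_IDX = 1
    w2lex_i = {"good": 2}
    # the if/else lookup and dict.get with a default are equivalent
    assert w2lex_i.get("good", UNK_IDX) == 2
    assert w2lex_i.get("missing", UNK_IDX) == UNK_IDX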
Example #4
    def _get_test_w2v_lstsq_emb(self, a_word):
        """Obtain embedding index for the given word.

        Args:
          a_word (str):
            word whose embedding should be retrieved

        Returns:
          np.array:
            embedding of the input word

        """
        a_word = normlex(a_word)
        emb_i = self._w2i.get(a_word)
        if emb_i is None:
            if a_word in self._embeddings:
                return np.dot(self._embeddings[a_word], self._lstsq_mtx)
            return self._embs[UNK_IDX]
        return self._embs[emb_i]
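The `_lstsq_mtx` used above is presumably a least-squares mapping from the pretrained embedding space into the task embedding space. A hedged sketch of how such a matrix could be fit over words present in both spaces (all names and dimensions here are illustrative, not the module's actual API):

    import numpy as np

    # Toy dimensions: 5 shared words, 300-d pretrained, 100-d task embeddings.
    pretrained = np.random.randn(5, 300)   # rows: pretrained vectors
    task = np.random.randn(5, 100)         # rows: task-specific vectors

    # Solve pretrained @ M ~= task in the least-squares sense.
    lstsq_mtx, *_ = np.linalg.lstsq(pretrained, task, rcond=None)

    # An unseen word's pretrained vector can now be projected into the task
    # space, mirroring np.dot(self._embeddings[a_word], self._lstsq_mtx).
    projected = np.dot(pretrained[0], lstsq_mtx)   # shape (100,)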
Example #5
    def _get_train_w2v_emb_i(self, a_word):
        """Obtain embedding index for the given word.

        Args:
          a_word (str):
            word whose embedding index should be retrieved

        Returns:
          int: embedding index of the given word

        """
        a_word = normlex(a_word)
        if a_word in self._w2i:
            return self._w2i[a_word]
        elif a_word in self._embeddings:
            i = self._w2i[a_word] = len(self._w2i)
            return i
        else:
            return UNK_IDX
Example #6
    def _get_train_w_emb_i(self, a_word):
        """Obtain embedding index for the given word.

        Args:
          a_word (str):
            word whose embedding index should be retrieved

        Returns:
          int:
            embedding index of the given word

        """
        a_word = normlex(a_word)
        if a_word in self._w2i:
            return self._w2i[a_word]
        elif self._w_stat[a_word] < 2 and np.random.binomial(1, UNK_PROB):
            return UNK_IDX
        else:
            i = self._w2i[a_word] = len(self._w2i)
            return i
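The branch with `np.random.binomial(1, UNK_PROB)` implements a common regularization trick: hapax legomena (words seen fewer than twice in `self._w_stat`) are stochastically mapped to the unknown token during training, so the model learns a useful UNK embedding. A self-contained sketch of the same idea, with an assumed value for `UNK_PROB` (the module defines its own constant):

    import numpy as np
    from collections import Counter

    UNK_IDX = 0
    UNK_PROB = 0.5   # assumed value
    w_stat = Counter(["rare", "common", "common", "common"])
    w2i = {"<unk>": UNK_IDX}

    def train_emb_i(word):
        if word in w2i:
            return w2i[word]
        # singletons are replaced by UNK with probability UNK_PROB
        if w_stat[word] < 2 and np.random.binomial(1, UNK_PROB):
            return UNK_IDX
        i = w2i[word] = len(w2i)
        return i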
Example #7
    def _get_lexicon_w_emb_i(self, a_word):
        """Obtain embedding index for a lexicon term.

        Args:
          a_word (str):
            word whose embedding index should be retrieved

        Returns:
          int:
            embedding index of the given word

        """
        a_word = normlex(a_word)
        if a_word in self._lexicon:
            scores = self._lexicon[a_word]
            max_score = -1.
            best_idx = -1
            for idx, score in iteritems(scores):
                if score > max_score:
                    max_score = score
                    best_idx = idx
            return self.score_idx2emb_idx[best_idx]
        return UNK_IDX
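The inner loop is simply an argmax over the term's lexicon scores. Since `_read_lexicons` stores non-negative scores (via `np.abs`), the same selection can be written more compactly; a toy illustration:

    # equivalent argmax over a score_idx -> score mapping
    scores = {0: 0.3, 1: 0.9, 2: 0.1}   # toy values
    best_idx = max(scores, key=scores.get)   # -> 1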
Example #8
    def _read_lexicons(self, a_lextype2lex, lexicons, encoding=ENCODING):
        """Load lexicons.

        Args:
          a_lextype2lex (dict: lextype -> (dict, dict)): mapping from
            lexicon type to target dictionaries for storing terms (UNUSED)
          lexicons (list): paths to the lexicons to be loaded
          encoding (str): input encoding of the lexicons

        Returns:
          np.ndarray: digitized lexicon matrix (one row per term,
            one column per (lexicon, polarity) pair)

        Note:
          populates `self._w2lex_i` and `self.lexicon` in place

        """
        self._w2lex_i = {EMPTY_TOK: EMPTY_IDX, UNK_TOK: UNK_IDX}
        term2scores = defaultdict(dict)
        lex_pol2score_idx = dict()
        min_score = 1.
        for lexpath_i in lexicons:
            lexname = os.path.splitext(os.path.basename(lexpath_i))[0]
            self._logger.debug("Reading lexicon %s...", lexname)
            lexicon = pd.read_table(lexpath_i,
                                    header=None,
                                    names=LEX_CLMS,
                                    dtype=LEX_TYPES,
                                    encoding=encoding,
                                    error_bad_lines=False,
                                    warn_bad_lines=True,
                                    keep_default_na=False,
                                    na_values=[''],
                                    quoting=QUOTE_NONE)
            for i, row_i in lexicon.iterrows():
                term = USCORE_RE.sub(' ', row_i.term)
                # since we do not recognize negated context, we skip negated
                # entries
                if NEG_SFX_RE.search(term):
                    self._logger.warning(
                        "Lexicon-based attention does not support negated"
                        " entries.  Skipping term %r.", term)
                    continue
                term = normlex(term)
                lex_pol = (lexname, row_i.polarity)
                if lex_pol not in lex_pol2score_idx:
                    lex_pol2score_idx[lex_pol] = len(lex_pol2score_idx)
                score_idx = lex_pol2score_idx[lex_pol]
                if term not in self._w2lex_i:
                    self._w2lex_i[term] = len(self._w2lex_i)
                word_idx = self._w2lex_i[term]
                score = np.abs(row_i.score)
                term2scores[word_idx][score_idx] = score
                min_score = min(min_score, score)
            self._logger.debug("Lexicon %s read...", lexname)
        # digitize lexicon, converting it to a numpy array
        self.lexicon = np.zeros((len(self._w2lex_i), len(lex_pol2score_idx)))
        self.lexicon += min_score / 10.
        for w_idx, scores in iteritems(term2scores):
            for score_idx, score in iteritems(scores):
                self.lexicon[w_idx, score_idx] = score
        self.lexicon[EMPTY_IDX, :] = 0.
        self.lexicon[UNK_IDX, :] /= 1e2
        return self.lexicon
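A toy walk-through of the final digitization step, assuming two lexicon entries (the lexicon name "hu-liu" is purely illustrative): rows of the matrix are indexed by `_w2lex_i`, columns by (lexicon, polarity) pairs; unfilled cells get a small floor of `min_score / 10`, the EMPTY row is zeroed out, and the UNK row is strongly dampened:

    import numpy as np

    EMPTY_IDX, UNK_IDX = 0, 1
    w2lex_i = {"<empty>": EMPTY_IDX, "<unk>": UNK_IDX, "good": 2, "bad": 3}
    # (lexicon, polarity) pairs -> column indices
    lex_pol2score_idx = {("hu-liu", "positive"): 0, ("hu-liu", "negative"): 1}
    term2scores = {2: {0: 0.8}, 3: {1: 0.6}}
    min_score = 0.6

    lexicon = np.zeros((len(w2lex_i), len(lex_pol2score_idx)))
    lexicon += min_score / 10.          # small floor for missing scores
    for w_idx, scores in term2scores.items():
        for score_idx, score in scores.items():
            lexicon[w_idx, score_idx] = score
    lexicon[EMPTY_IDX, :] = 0.          # padding token carries no polarity
    lexicon[UNK_IDX, :] /= 1e2          # strongly dampen unknown words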