def _tweet2wseq(self, msg):
    """Convert a tweet to a sequence of word lemmas if these words are
    informative.

    Args:
        msg (cgsa.data.Tweet): input message

    Returns:
        list: lemmas of informative words

    """
    return [normlex(w.lemma) for w in msg if is_relevant(w.form)]

def _get_test_w_emb_i(self, a_word):
    """Obtain embedding index for the given word.

    Args:
        a_word (str): word whose embedding index should be retrieved

    Returns:
        int: embedding index of the given word

    """
    a_word = normlex(a_word)
    return self._w2i.get(a_word, UNK_IDX)

def get_lex_emb_i(self, a_word):
    """Obtain lexicon embedding index for the given word.

    Args:
        a_word (str): word whose embedding index should be retrieved

    Returns:
        int: lexicon embedding index of the given word

    """
    a_word = normlex(a_word)
    return self._w2lex_i.get(a_word, UNK_IDX)

def _get_test_w2v_lstsq_emb(self, a_word):
    """Obtain embedding for the given word, projecting pre-trained
    word2vec vectors into the task space if necessary.

    Args:
        a_word (str): word whose embedding should be retrieved

    Returns:
        np.array: embedding of the input word

    """
    a_word = normlex(a_word)
    emb_i = self._w2i.get(a_word)
    if emb_i is not None:
        return self._embs[emb_i]
    if a_word in self._embeddings:
        # project the pre-trained vector with the least-squares matrix
        return np.dot(self._embeddings[a_word], self._lstsq_mtx)
    return self._embs[UNK_IDX]

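# NOTE: the following demo is an illustrative addition, not part of the
# original pipeline.  It sketches one way a projection matrix such as
# ``self._lstsq_mtx`` can be obtained: solve a least-squares problem that
# maps the pre-trained vectors of shared vocabulary words onto their
# task-specific counterparts.  All shapes and data below are made up.
def _demo_lstsq_projection():
    """Return a toy least-squares projection between embedding spaces."""
    pretrained = np.random.rand(100, 300)  # pre-trained vectors (shared words)
    task_embs = np.random.rand(100, 50)    # task vectors of the same words
    # solve pretrained @ M ~= task_embs for M in the least-squares sense
    lstsq_mtx, _, _, _ = np.linalg.lstsq(pretrained, task_embs, rcond=None)
    # an out-of-vocabulary pre-trained vector can then be projected:
    projected = np.dot(pretrained[0], lstsq_mtx)  # shape: (50,)
    return lstsq_mtx, projected
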
def _get_train_w2v_emb_i(self, a_word):
    """Obtain embedding index for the given word.

    Args:
        a_word (str): word whose embedding index should be retrieved

    Returns:
        int: embedding index of the given word

    """
    a_word = normlex(a_word)
    if a_word in self._w2i:
        return self._w2i[a_word]
    elif a_word in self._embeddings:
        # the word is covered by the pre-trained embeddings, so assign
        # a new index to it
        i = self._w2i[a_word] = len(self._w2i)
        return i
    else:
        return UNK_IDX

def _get_train_w_emb_i(self, a_word):
    """Obtain embedding index for the given word.

    Args:
        a_word (str): word whose embedding index should be retrieved

    Returns:
        int: embedding index of the given word

    """
    a_word = normlex(a_word)
    if a_word in self._w2i:
        return self._w2i[a_word]
    elif self._w_stat[a_word] < 2 and np.random.binomial(1, UNK_PROB):
        # randomly map rare words (seen fewer than two times) to UNK
        return UNK_IDX
    else:
        i = self._w2i[a_word] = len(self._w2i)
        return i

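# NOTE: illustrative addition, not part of the original class.  The binomial
# draw above acts as word dropout for rare terms: a word seen fewer than two
# times is mapped to UNK_IDX with probability UNK_PROB, presumably so the
# UNK embedding gets trained on realistic contexts.  The helper below just
# demonstrates the expected replacement rate; ``unk_prob=0.5`` is an assumed
# value, not the actual UNK_PROB of this module.
def _demo_unk_downsampling(n_rare_words=10000, unk_prob=0.5):
    """Estimate the share of rare words that would be mapped to UNK."""
    draws = np.random.binomial(1, unk_prob, size=n_rare_words)
    return draws.mean()  # converges to unk_prob
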
def _get_lexicon_w_emb_i(self, a_word):
    """Obtain embedding index for a lexicon term.

    Args:
        a_word (str): word whose embedding index should be retrieved

    Returns:
        int: embedding index of the given word

    """
    a_word = normlex(a_word)
    if a_word not in self._lexicon:
        return UNK_IDX
    # pick the polarity class with the highest lexicon score
    scores = self._lexicon[a_word]
    best_idx = max(scores, key=scores.get)
    return self.score_idx2emb_idx[best_idx]

def _read_lexicons(self, a_lextype2lex, lexicons, encoding=ENCODING):
    """Load lexicons.

    Args:
        a_lextype2lex (dict: lextype -> (dict, dict)): mapping from
            lexicon type to target dictionaries for storing terms (UNUSED)
        lexicons (list): paths to the lexicons to be loaded
        encoding (str): input encoding of the lexicons

    Returns:
        np.array: digitized lexicon matrix of shape (number of terms,
            number of distinct (lexicon, polarity) pairs)

    Note:
        populates `self._w2lex_i` and `self.lexicon` in place

    """
    self._w2lex_i = {EMPTY_TOK: EMPTY_IDX, UNK_TOK: UNK_IDX}
    term2scores = defaultdict(dict)
    lex_pol2score_idx = dict()
    min_score = 1.
    for lexpath_i in lexicons:
        lexname = os.path.splitext(os.path.basename(lexpath_i))[0]
        self._logger.debug("Reading lexicon %s...", lexname)
        lexicon = pd.read_table(lexpath_i, header=None, names=LEX_CLMS,
                                dtype=LEX_TYPES, encoding=encoding,
                                error_bad_lines=False, warn_bad_lines=True,
                                keep_default_na=False, na_values=[''],
                                quoting=QUOTE_NONE)
        for i, row_i in lexicon.iterrows():
            term = USCORE_RE.sub(' ', row_i.term)
            # since we do not recognize negated contexts, we skip negated
            # entries
            if NEG_SFX_RE.search(term):
                self._logger.warning(
                    "Lexicon-based attention does not support negated"
                    " entries. Skipping term %r.", term)
                continue
            term = normlex(term)
            lex_pol = (lexname, row_i.polarity)
            if lex_pol not in lex_pol2score_idx:
                lex_pol2score_idx[lex_pol] = len(lex_pol2score_idx)
            score_idx = lex_pol2score_idx[lex_pol]
            if term not in self._w2lex_i:
                self._w2lex_i[term] = len(self._w2lex_i)
            word_idx = self._w2lex_i[term]
            score = np.abs(row_i.score)
            term2scores[word_idx][score_idx] = score
            min_score = min(min_score, score)
        self._logger.debug("Lexicon %s read...", lexname)
    # digitize the lexicon, converting it to a numpy array; cells without
    # an observed score keep a small positive floor instead of exact zeros
    self.lexicon = np.zeros((len(self._w2lex_i), len(lex_pol2score_idx)))
    self.lexicon += min_score / 10.
    for w_idx, scores in iteritems(term2scores):
        for score_idx, score in iteritems(scores):
            self.lexicon[w_idx, score_idx] = score
    self.lexicon[EMPTY_IDX, :] = 0.
    self.lexicon[UNK_IDX, :] /= 1e2
    return self.lexicon

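# NOTE: illustrative addition.  A toy version of the digitization step
# above: rows of the resulting matrix correspond to lexicon terms, columns
# to (lexicon name, polarity) pairs.  Cells without an observed score keep
# a small floor (one tenth of the smallest observed score), the EMPTY row
# is zeroed out, and the UNK row is strongly damped.  All names, indices,
# and scores below are made up for the demonstration.
def _demo_digitize_lexicon():
    """Return a toy digitized lexicon matrix."""
    empty_idx, unk_idx = 0, 1
    w2lex_i = {"%empty%": empty_idx, "%unk%": unk_idx, "good": 2, "bad": 3}
    lex_pol2score_idx = {("toy_lex", "positive"): 0,
                         ("toy_lex", "negative"): 1}
    term2scores = {2: {0: 0.9}, 3: {1: 0.7}}
    min_score = 0.7
    lexicon = np.zeros((len(w2lex_i), len(lex_pol2score_idx)))
    lexicon += min_score / 10.
    for w_idx, scores in term2scores.items():
        for score_idx, score in scores.items():
            lexicon[w_idx, score_idx] = score
    lexicon[empty_idx, :] = 0.   # the EMPTY token carries no lexicon signal
    lexicon[unk_idx, :] /= 1e2   # the UNK token gets a strongly damped score
    return lexicon               # shape: (4, 2)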