def get_boilerplate(self, X_df):
    """Extract per-keyword word lists from the boilerplate column (transform).

    Column index 2 of every row of ``X_df`` is decoded with ``json.loads``.
    For each keyword in ``self.keywords`` the decoded value is kept when it
    is present and truthy; otherwise an empty string is used in its place.
    Each collected value is then passed to ``extract_words`` together with
    the keyword's vocabulary (``self.topwords[kw]``) and bigram switch
    (``self.params[kw]['bgram']``).

    Args:
        X_df: Container exposing ``shape[0]`` and ``[row, col]`` indexing.
            NOTE(review): despite the name this is presumably a 2-D NumPy
            array rather than a DataFrame -- confirm against the caller.

    Returns:
        dict mapping each keyword to a list holding one ``extract_words``
        result per row, in row order.
    """
    fields = self.keywords
    n_rows = X_df.shape[0]

    # Phase 1: decode every row once and gather the raw text per keyword.
    raw_by_field = {field: [] for field in fields}
    for row in range(n_rows):
        record = json.loads(X_df[row, 2])
        for field in fields:
            value = record[field] if field in record else None
            # Missing keys and falsy values (null, "", ...) both become ''.
            raw_by_field[field].append(value if value else '')

    # Phase 2: run the word extraction with each keyword's own settings.
    result = {}
    for field in fields:
        bigrams = self.params[field]['bgram']
        vocabulary = self.topwords[field]
        result[field] = [
            extract_words(text, vocab=vocabulary, use_bigrams=bigrams)
            for text in raw_by_field[field]
        ]
    return result
def get_boilerplate(self, X_df):
    """Build word lists for each keyword from the boilerplate JSON (transform).

    Every row's entry at column index 2 of ``X_df`` is parsed as JSON. For
    each keyword in ``self.keywords`` the parsed value is collected, with an
    empty string substituted when the keyword is absent or its value is
    falsy. The collected values are then fed to ``extract_words`` using
    ``self.topwords[kw]`` as the vocabulary and ``self.params[kw]['bgram']``
    as the bigram flag.

    Args:
        X_df: Object providing ``shape[0]`` and ``[row, col]`` indexing.
            NOTE(review): presumably a 2-D NumPy array despite the ``_df``
            suffix -- verify against the caller.

    Returns:
        dict keyed by keyword; each value is a list with one
        ``extract_words`` result per row, in row order.
    """
    keys = self.keywords
    row_count = X_df.shape[0]

    # Collect the raw per-keyword text, one entry per row.
    texts = {key: [] for key in keys}
    for idx in range(row_count):
        parsed = json.loads(X_df[idx, 2])
        for key in keys:
            if key not in parsed or not parsed[key]:
                # Absent or falsy value: keep the row aligned with ''.
                texts[key].append('')
                continue
            texts[key].append(parsed[key])

    # Convert each keyword's texts into word lists with its own settings.
    extracted = {}
    for key in keys:
        bigram_flag = self.params[key]['bgram']
        vocab = self.topwords[key]
        column = texts[key]
        extracted[key] = [
            extract_words(column[idx], vocab=vocab, use_bigrams=bigram_flag)
            for idx in range(row_count)
        ]
    return extracted