コード例 #1
0
    def get_boilerplate(self, X_df):
        """Build per-keyword word lists from the JSON held in column 2.

        For every row, ``X_df[i, 2]`` is decoded with ``json.loads``.  For
        each keyword in ``self.keywords`` the decoded value is kept when the
        key is present and its value is truthy; otherwise an empty string is
        used in its place.  Each kept value is then handed to
        ``extract_words`` together with that keyword's vocabulary
        (``self.topwords[kw]``) and bigram flag (``self.params[kw]['bgram']``).

        NOTE(review): ``X_df`` is indexed as ``X_df[i, 2]``, so it is
        presumably a 2-D array rather than a pandas DataFrame -- confirm
        against the callers.

        Returns:
            dict mapping each keyword to a list holding one
            ``extract_words`` result per row of ``X_df``.
        """
        keywords = self.keywords
        n_rows = X_df.shape[0]

        # Pass 1: decode each row once and gather the raw value per keyword.
        raw_by_kw = {kw: [] for kw in keywords}
        for row in range(n_rows):
            parsed = json.loads(X_df[row, 2])
            for kw in keywords:
                # Missing keys and falsy values both collapse to ''.
                has_value = kw in parsed and parsed[kw]
                raw_by_kw[kw].append(parsed[kw] if has_value else '')

        # Pass 2: run the word extraction with each keyword's own settings.
        wordlists = {}
        for kw in keywords:
            bigram_flag = self.params[kw]['bgram']
            vocabulary = self.topwords[kw]
            texts = raw_by_kw[kw]
            wordlists[kw] = [
                extract_words(texts[row],
                              vocab=vocabulary,
                              use_bigrams=bigram_flag)
                for row in range(n_rows)
            ]
        return wordlists
コード例 #2
0
ファイル: model09.py プロジェクト: orazaro/stumbleupon_kaggle
 def get_boilerplate(self, X_df):
   """
       Transform step: turn the JSON found in column 2 of every row into
       word lists, one list per keyword.

       Each ``X_df[i, 2]`` is decoded with ``json.loads``.  A keyword that is
       absent from the decoded object, or whose value is falsy, contributes
       an empty string for that row.  The collected values are then passed
       through ``extract_words`` using ``self.topwords[kw]`` as vocabulary
       and ``self.params[kw]['bgram']`` as the bigram switch.

       NOTE(review): the ``X_df[i, 2]`` indexing suggests a 2-D array, not
       a pandas DataFrame -- verify with the caller.

       Returns a dict ``{keyword: [extract_words result for each row]}``.
   """
   kws = self.keywords
   total = X_df.shape[0]

   # First pass: one JSON decode per row, one collected entry per keyword.
   gathered = dict((kw, []) for kw in kws)
   for idx in range(total):
       record = json.loads(X_df[idx, 2])
       for kw in kws:
           if kw not in record or not record[kw]:
               gathered[kw].append('')
           else:
               gathered[kw].append(record[kw])

   # Second pass: extract words with the per-keyword configuration.
   result = {}
   for kw in kws:
       with_bigrams = self.params[kw]['bgram']
       vocab_for_kw = self.topwords[kw]
       # Only the first `total` collected entries are consumed.
       result[kw] = [
           extract_words(text, vocab=vocab_for_kw,
               use_bigrams=with_bigrams)
           for text in gathered[kw][:total]
       ]
   return result