def _prepare(self, data):
    """Select columns from ``data`` with the help of the target column.

    The parent class's ``_prepare`` is run on the full ``data`` first.
    The target is then created from ``self.context`` and reduced to a
    single column. When ``self.train_only`` is truthy, both the data and
    the target are reindexed to ``self.context.train_index`` before being
    handed to ``self.select``.

    Returns:
        A ``(cols, dct)`` tuple: the result of ``self.select(x, y)`` and
        the value returned by the parent's ``_prepare``.
    """
    dct = super(SelectNgramCounts, self)._prepare(data)

    y = get_single_column(self.target.create(self.context))
    x = data
    if self.train_only:
        # Restrict both inputs to the training rows before selecting.
        train_index = self.context.train_index
        y = y.reindex(train_index)
        x = x.reindex(train_index)

    return self.select(x, y), dct
def _prepare(self, data):
    """Run the parent preparation, then pick columns via ``self.select``.

    ``self.select`` receives the data and the single target column created
    from ``self.context``. If ``self.train_only`` is set, both are first
    reindexed to ``self.context.train_index``.

    Returns:
        ``(cols, dct)`` where ``cols`` is what ``self.select`` returned and
        ``dct`` is what the parent's ``_prepare`` returned.
    """
    parent_result = super(SelectNgramCounts, self)._prepare(data)
    target = get_single_column(self.target.create(self.context))
    restrict = self.train_only

    y = target.reindex(self.context.train_index) if restrict else target
    x = data.reindex(self.context.train_index) if restrict else data

    selected = self.select(x, y)
    return selected, parent_result
def _create(self, data):
    """Build vectors for the single column in ``data``.

    The vectors come from ``self.make_vectors``; each resulting column is
    renamed to ``<original column>_<data column name>``.

    Returns:
        The object returned by ``self.make_vectors`` with renamed columns.
    """
    column = get_single_column(data)
    # Loading cached vectors (``self.load('topic_vecs')``) is disabled, so
    # the vectors are rebuilt on every call regardless of ``self.force``.
    vectors = self.make_vectors(column)
    vectors.columns = [
        '%s_%s' % (name, column.name) for name in vectors.columns
    ]
    return vectors
def _create(self, data):
    """Compute vectors for ``data`` and suffix their column names.

    ``data`` is reduced to a single column, passed to
    ``self.make_vectors``, and every column of the result is renamed to
    ``'%s_%s' % (column, data.name)``.

    Returns:
        The renamed result of ``self.make_vectors``.
    """
    series = get_single_column(data)
    suffix = series.name
    # No cached copy is consulted (the ``self.load('topic_vecs')`` lookup
    # is turned off), so this always recomputes.
    vecs = self.make_vectors(series)
    renamed = []
    for col in vecs.columns:
        renamed.append('%s_%s' % (col, suffix))
    vecs.columns = renamed
    return vecs
def _prepare(self, data):
    """Obtain the dictionary for the documents in ``data``.

    The single column of ``data`` is materialised as a list of documents
    and passed, together with ``self.context``, to
    ``self.dictionary.get_dict``. When ``self.verbose`` is truthy, the
    first ten documents and the resulting dictionary are printed.

    Returns:
        Whatever ``self.dictionary.get_dict`` returns.
    """
    documents = list(get_single_column(data))
    if self.verbose:
        # Single-argument form prints the same text as the print statement.
        print(documents[:10])

    dictionary = self.dictionary.get_dict(self.context, documents)
    if self.verbose:
        print(dictionary)
    return dictionary
def _create(self, data):
    """Build a per-document count frame from the prepared dictionary.

    Each document in the single column of ``data`` is converted with
    ``dct.doc2bow`` into (id, count) pairs, which become one row of the
    frame. There is one column per dictionary id, named
    ``<dct[id]>_<data column name>``. Ids absent from a document are
    filled with 0.

    When ``self.bool_`` is truthy, the counts are collapsed to 0/1
    indicators.

    Returns:
        A ``DataFrame`` indexed like the input column.
    """
    dct = self.get_prep_data(data)
    data = get_single_column(data)
    docs = [dct.doc2bow(d) for d in data]
    # Sort the ids so the column order is deterministic instead of
    # depending on the dictionary's internal key order.
    ids = sorted(dct.keys())
    df = DataFrame([dict(row) for row in docs], columns=ids,
                   index=data.index)
    df.columns = ['%s_%s' % (dct[i], data.name) for i in ids]
    df = df.fillna(0)
    if self.bool_:
        # Any non-zero count becomes 1.
        df = df.astype(bool).astype(int)
    return df
def _create(self, data):
    """Turn the documents in ``data`` into a frame of per-id counts.

    Rows are built from the (id, count) pairs that ``doc2bow`` of the
    prepared dictionary yields for each document. Columns follow the
    sorted dictionary ids and are labelled
    ``'%s_%s' % (dictionary[id], column name)``. Missing entries become 0,
    and if ``self.bool_`` is truthy all values are reduced to 0/1.

    Returns:
        A ``DataFrame`` sharing the index of the input column.
    """
    dictionary = self.get_prep_data(data)
    column = get_single_column(data)

    bows = [dictionary.doc2bow(doc) for doc in column]
    token_ids = sorted(dictionary.keys())
    rows = [dict(bow) for bow in bows]

    frame = DataFrame(rows, columns=token_ids, index=column.index)
    labels = []
    for token_id in token_ids:
        labels.append('%s_%s' % (dictionary[token_id], column.name))
    frame.columns = labels

    frame = frame.fillna(0)
    if not self.bool_:
        return frame
    # Collapse counts to presence indicators.
    return frame.astype(bool).astype(int)
def combine(self, datas):
    """Join several single-column inputs row by row into one column.

    Each element of ``datas`` is reduced to a single column. The values at
    the same position are joined with ``self.sep``; iteration stops at the
    shortest column because ``zip`` is used. The resulting column is named
    by joining the input column names with ``'_'``.

    Returns:
        A one-column ``DataFrame`` with a default index (no index is
        passed through from the inputs).
    """
    columns = [get_single_column(item) for item in datas]
    joined = [self.sep.join(values) for values in zip(*columns)]
    combined_name = '_'.join([col.name for col in columns])
    return DataFrame(joined, columns=[combined_name])