def run(self):
    """Build a vocabulary from the training text column and save it to the output target."""
    train_column = sf.load_sframe(self.input()[0].fn)[self.train_field]
    with self.output().open('w') as target:
        print("start to make vocab")
        vocabulary = Vocab(train_column)
        # Drop terms outside the [no_below, no_above] document-frequency band,
        # keeping at most keep_n entries.
        vocabulary.trim(self.no_below, self.no_above, self.keep_n)
        vocabulary.save(target)
def run(self):
    """Sample a training fraction of the input SFrame, keep only the training column, and save it."""
    source = sf.load_sframe(self.input()[0].fn)
    sample = source.sample(self.train_fraction)
    print("sampled %d documents" % sample.num_rows())
    # Strip every column except the training field before persisting.
    drop_cols = [name for name in sample.column_names() if name != self.train_field]
    sample.remove_columns(drop_cols)
    sample.save(self.output().fn)
def run(self):
    """Compute the ensemble column per user, drop empty results, and export as TSV."""
    users = sf.load_sframe(self.input()[0]['user'].fn)
    # self.ensemble reads this wordset when applied row-by-row below.
    self.wordset = WordSet(self.input()[1].fn)
    users['ensemble'] = users.apply(self.ensemble)
    users = users.select_columns(['id', 'ensemble'])
    # Keep only rows whose ensemble result is non-empty (1 keeps, 0 drops).
    users = users[users['ensemble'].apply(lambda items: int(len(items) > 0))]
    users.export_csv(self.output().fn, quote_level=csv.QUOTE_NONE,
                     delimiter="\t", header=False)
def run(self):
    """Filter each user's bag-of-words through the wordset and export non-empty rows as TSV."""
    users = sf.load_sframe(self.input()[0]['user'].fn)
    # Retain only the id and the user field; everything else is removed.
    drop_cols = [name for name in users.column_names()
                 if name not in ("id", self.user_field)]
    users.remove_columns(drop_cols)
    wordset = WordSet(self.input()[1].fn)
    users[self.user_field] = users[self.user_field].apply(wordset.filter_bows)
    # Rows whose filtered bow collapsed to the empty string carry no signal.
    users = users[users[self.user_field] != ""]
    users.export_csv(self.output().fn, quote_level=csv.QUOTE_NONE,
                     delimiter="\t", header=False)
def run(self):
    """Export the merged recommendations to a local TSV, stage it on HDFS, bulk-load into HBase, then clean up."""
    hdfs_client = luigi.contrib.hdfs.hadoopcli_clients.create_hadoopcli_client()
    recs = sf.load_sframe(self.input()[0]['rec'].fn)
    # Only id, history and rlist are bulk-loaded; drop the rest.
    drop_cols = [name for name in recs.column_names()
                 if name not in ("history", "id", "rlist")]
    recs.remove_columns(drop_cols)
    recs.export_csv(self.local_csv, quote_level=csv.QUOTE_NONE,
                    delimiter="\t", header=False)
    staged_csv = "%s/user.rec.csv" % self.hbase_input_path
    hdfs_client.mkdir(self.hbase_input_path)
    hdfs_client.put(self.local_csv, staged_csv)
    # The local copy is no longer needed once it is on HDFS.
    os.remove(self.local_csv)
    to_hbase(self.hbase_input_path, self.bin)
    # Remove the HDFS staging directory after the HBase load completes.
    hdfs_client.remove(self.hbase_input_path)
def merge_recommend(merged_fn, latest_rec_fn, latest_user_fn, latest_topic_fn):
    """Merge the latest recommendation batch into the persisted merged SFrame.

    Joins the latest recommendations, user histories, and topic features on
    'id', then either creates ``merged_fn`` (first run) or replaces the rows
    for the ids present in this batch and appends the batch.

    :param merged_fn: path of the persisted merged SFrame (created if absent)
    :param latest_rec_fn: TSV of (id, rlist) recommendations
    :param latest_user_fn: saved SFrame containing user histories
    :param latest_topic_fn: TSV of (id, fea) topic features
    """
    # read recommend df
    latest_rec_df = sf.SFrame.read_csv(latest_rec_fn, delimiter="\t",
                                       column_type_hints=[str, list],
                                       header=False)
    latest_rec_df.rename({"X1": "id", "X2": "rlist"})
    # read history df, keeping only the id and history columns
    latest_history_df = sf.load_sframe(latest_user_fn)
    delete_cols = [col for col in latest_history_df.column_names()
                   if col != "history" and col != "id"]
    latest_history_df.remove_columns(delete_cols)
    # read topic df
    latest_topic_df = sf.SFrame.read_csv(latest_topic_fn, delimiter="\t",
                                         column_type_hints=[str, str],
                                         header=False)
    latest_topic_df.rename({"X1": "id", "X2": "fea"})
    # join all three on id; left joins keep every user that has a history
    latest_df = latest_history_df.join(latest_rec_df, on='id', how='left') \
                                 .join(latest_topic_df, on='id', how='left')
    if not os.path.exists(merged_fn):
        # BUG FIX: was `lasted_df.save(...)` — an undefined name that raised
        # NameError on the very first run (no merged file yet).
        latest_df.save(merged_fn)
    else:
        # Drop the ids present in this batch from the old merged frame,
        # then append the fresh rows so each id keeps only its latest data.
        merged_df = sf.load_sframe(merged_fn)
        latest_id = latest_df.select_column("id")
        merged_df = merged_df.filter_by(latest_id, 'id', exclude=True)
        merged_df = merged_df.append(latest_df)
        merged_df.save(merged_fn)
def run(self):
    """Export history, recommendation, and user-topic columns to temp TSVs and import each into the database."""
    merged = sf.load_sframe(self.input()[0]['rec'].fn)
    # (banner printed before the step, columns to export, temp csv path, importer)
    steps = [
        ("import history", ['id', 'history'], self.history_fn, import_history),
        ("import recommendation", ['id', 'rlist'], self.rlist_fn, import_rlist),
        ("import user-topic-table", ['id', 'fea'], self.fea_fn, import_user_topic),
    ]
    for banner, columns, tmp_csv, importer in steps:
        print(banner)
        subset = merged.select_columns(columns)
        subset.export_csv(tmp_csv, quote_level=csv.QUOTE_NONE,
                          delimiter="\t", header=False)
        importer(tmp_csv, self.host, self.db, self.user, self.passwd)
        # The temp TSV is only a transport file; delete it after import.
        os.remove(tmp_csv)
    # Tail of the no-stop-words feature-name mapping (the dict is opened
    # before this chunk): unpacked column names X.4..X.9 -> readable names.
    'X.4': '#_of_words_in_intersection_no_stops',
    'X.5': '#_of_words_in_union_no_stops',
    'X.6': 'jaccard_similarity_no_stops',
    'X.7': 'average_word_length_no_stops1',
    'X.8': 'average_word_length_no_stops2',
    'X.9': 'lists_identical_no_stops'
}
# All feature columns: with-stops names, no-stops names, plus the extra
# 'lower_identical' flag computed below.
feature_names = features.values() + features_no_stops.values() + [
    'lower_identical'
]
print 'start'
print datetime.datetime.now()
# Load the pre-tokenized train/test SFrames from disk.
train = sframe.load_sframe('train_lists.sf')
test = sframe.load_sframe('test_lists.sf')
print 'files loaded'
print datetime.datetime.now()
# create features with and without stop words for train
result = train.apply(lambda row: generate_features_from_lists(
    row['word_list1'], row['word_list2'])).unpack()
train.add_columns(result.rename(features))
# Same features computed on the stop-word-filtered token lists.
result = train.apply(lambda row: generate_features_from_lists(
    row['word_list_no_stops1'], row['word_list_no_stops2'])).unpack()
train.add_columns(result.rename(features_no_stops))
# Extra boolean feature: questions identical after lowercasing.
train['lower_identical'] = train.apply(
    lambda row: lower_identical(row['question1'], row['question2']))
def run(self):
    """Filter the training column through the wordset and export the non-empty rows as TSV."""
    frame = sf.load_sframe(self.input()[0].fn)
    wordset = WordSet(self.input()[1].fn)
    frame[self.train_field] = frame[self.train_field].apply(wordset.filter_bows)
    # Discard rows whose filtered bag-of-words came back empty.
    frame = frame[frame[self.train_field] != ""]
    frame.export_csv(self.output().fn, delimiter="\t",
                     quote_level=csv.QUOTE_NONE, header=False)
def run(self):
    """Sample a training fraction of the 'union' column and save it as an SFrame."""
    union_only = sf.load_sframe(self.input()[0].fn).select_columns(['union'])
    sample = union_only.sample(self.train_fraction)
    print("sampled %d documents" % sample.num_rows())
    sample.save(self.output().fn)