def build_model(self, **kwargs):
        ti = kwargs['ti']
        streams = ti.xcom_pull(task_ids='load_stream')
        users = ti.xcom_pull(task_ids='load_users')
        ab_user = ti.xcom_pull(task_ids='label')

        # preprocessing
        users['abnormal'] = [0] * users.shape[0]
        # fill missing birth years with the most frequent value;
        # index via DataFrame.loc to avoid chained assignment
        users.loc[users['birth_year'] == '', 'birth_year'] = users[
            'birth_year'].value_counts().index[0]
        users['birth_year'] = users['birth_year'].astype('int')
        for i in ['access', 'gender']:
            le = preprocessing.LabelEncoder()
            users[i] = le.fit_transform(users[i])

        # feature engineering
        vectorizer = CountVectorizer()
        streams = streams.groupby('user_id')['track_id'].apply(
            lambda x: ' '.join(x))
        counts = vectorizer.fit_transform(streams)

        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(counts)

        als = implicit.als.AlternatingLeastSquares(factors=50)
        als.fit(tfidf.T)  # item_user matrix
        user_factors = pd.DataFrame(data=als.user_factors)

        users = users.sort_values(by='user_id')
        users.reset_index(drop=True, inplace=True)
        user_factors.reset_index(drop=True, inplace=True)
        users = pd.concat([users, user_factors], axis=1, ignore_index=True)
        users.rename(columns={
            0: 'access',
            1: 'birth_year',
            2: 'country',
            3: 'gender',
            4: 'user_id',
            5: 'abnormal'
        },
                     inplace=True)

        # set X, y
        y = users['abnormal']
        X = users.drop(['abnormal', 'user_id'], axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.3,
                                                            random_state=42)

        # training
        clf = RandomForestClassifier(n_jobs=10)
        clf.fit(X_train, y_train)
        print(clf.score(X_test, y_test))

        # testing
        y_pred = clf.predict(X_test)
        cf = pd.DataFrame(data=confusion_matrix(y_test, y_pred))
        gcs.Bucket('ru_test').item('cf.csv').write_to(cf.to_csv(index=False),
                                                      'text/csv')
Example #2
class TabsMR(object):

    def __init__(self):
        self.rawdata = None
        self.access_granted = False  # used to verify the API credentials
        self.access_message = None
        self.bucket_name = None

        self.registered_rawdata = False

    def credentials(self, certificate):
        """
        certificate = {
            "account_type": "tabulation",
            "client":"clientname",
            "project_id": "project_id",
            "private_key_id": "a4a9f3c7600081ea9bad46ece1b158e2f16454e2",
            "user_email": "*****@*****.**",
            "user_id": "101670599119528512817",
        }

        """
        self.certificate = certificate
	
        conn = psycopg2.connect(
            host='localhost',
            port=54320,
            dbname='my_database',
            user='******',
        )
        cur = conn.cursor()
        # use a parameterized query to avoid quoting and injection problems
        cur.execute("""
            SELECT private_key_id FROM public.credentials
            WHERE account_type=%s
                AND project_id=%s
                AND user_email=%s
                AND user_id=%s;""",
                    (certificate["account_type"], certificate["project_id"],
                     certificate["user_email"], certificate["user_id"]))
        rec = cur.fetchone()

        if rec is not None and rec[0] == certificate["private_key_id"]:
            self.access_granted = True
            self.access_message = "Your access credentials are granted!"
        else:
            self.access_message = "Your access credentials are denied, please recheck your key!"

    def do_register_rawdata(self, csv_path):
        """
        register the raw data to the Google Cloud Storage bucket
        """
        if not self.registered_rawdata:
            a = pd.read_csv(csv_path)
            gcs.Bucket(self.bucket_name).item(self.certificate["client"] + "/dataset_" + self.certificate["project_id"] + ".csv")\
                                        .write_to(a.to_csv(), 'text/csv')
            self.registered_rawdata = True
            print("Your raw data has been registered")
Example #3
def update_record(station, weather):
    print(f"Updating {station['address']}")
    if "/" in station['address']:
        data = pd.read_csv(
            'gs://dbikes-planner.appspot.com/station_records/Princes Street.csv'
        )
    else:
        data = pd.read_csv(
            f"gs://dbikes-planner.appspot.com/station_records/{station['address']}.csv"
        )

    epoch_time = station['last_update']

    entry_datetime = datetime.fromtimestamp(epoch_time / 1000)

    if is_time_between(time(3, 30), time(5, 0), entry_datetime.time()):
        return

    last_line = data.tail(3).to_csv()

    if entry_datetime.isoformat() in last_line:
        return

    day_index = entry_datetime.weekday()

    if day_index <= 4:
        day_type = 0
    else:
        day_type = 10

    new_row = {
        'available_bikes': station['available_bikes'],
        'available_bike_stands': station['available_bike_stands'],
        'time_of_day': datetime_to_seconds(entry_datetime),
        'type_of_day': day_type,
        'day_of_year': entry_datetime.timetuple().tm_yday,
        'iso_date': entry_datetime.isoformat(),
        'temperature': weather['temperature'],
        'relative_humidity': weather['humidity'],
        'wind_speed': weather['wind_speed'],
        'rain': weather['rain'],
        'visibility': weather['visibility'],
        'bike_availability': category(station['available_bikes']),
        'bike_stand_availability': category(station['available_bike_stands']),
        'unix_timestamp': entry_datetime.timestamp() // 3600
    }

    new_row_dataframe = pd.DataFrame(new_row, index=[0])

    combined_df = pd.concat([data, new_row_dataframe], ignore_index=True)

    gcs.Bucket('dbikes-planner.appspot.com').item(f'station_records/{station["address"]}.csv') \
        .write_to(combined_df.to_csv(index=False), 'text/csv')
Example #4
 def load_stream(self, **kwargs):
     os.environ[
         'GOOGLE_APPLICATION_CREDENTIALS'] = '/disk/ru/My First Project-d1196e9f3e13.json'
     s = str(gcs.Bucket('ru_test').item('streams').read_from(), 'utf-8')
     streams = pd.read_json(StringIO(s), lines=True)
     streams['timestamp'] = pd.to_datetime(streams['timestamp'],
                                           format='%Y-%m-%d %H:%M:%S')
     streams.sort_values(by=['timestamp'], inplace=True)
     streams.reset_index(inplace=True, drop=True)
     streams['delta'] = streams.groupby(
         ['user_id'])['timestamp'].diff().dt.total_seconds()
     return streams
Example #5
from datalab.context import Context
import datalab.storage as storage
import datalab.bigquery as bq
import pandas as pd

# Dataframe to write
simple_dataframe = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=['a', 'b', 'c'])

sample_bucket_name = Context.default().project_id + '-datalab-example'
sample_bucket_path = 'gs://' + sample_bucket_name
sample_bucket_object = sample_bucket_path + '/Hello.txt'
bigquery_dataset_name = 'TestDataSet'
bigquery_table_name = 'TestTable'

# Define storage bucket
sample_bucket = storage.Bucket(sample_bucket_name)

# Create storage bucket if it does not exist
if not sample_bucket.exists():
    sample_bucket.create()

# Define BigQuery dataset and table
dataset = bq.Dataset(bigquery_dataset_name)
table = bq.Table(bigquery_dataset_name + '.' + bigquery_table_name)

# Create BigQuery dataset
if not dataset.exists():
    dataset.create()

# Create or overwrite the existing table if it exists
table_schema = bq.Schema.from_dataframe(simple_dataframe)
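
# A possible continuation of the example above, assuming the datalab bq.Table
# API of older datalab releases (create() taking a schema and overwrite flag,
# insert_data() loading a DataFrame into the table):
table.create(schema=table_schema, overwrite=True)
table.insert_data(simple_dataframe)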
Example #6
def write_to_storage(bucket_name, df_to_write, file_name_in_bucket):
    gcs.Bucket(bucket_name).item(file_name_in_bucket).write_to(
        df_to_write.to_csv(), 'text/csv')
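
# Hypothetical call of the helper above; the bucket name, DataFrame, and
# object name are placeholders (pandas imported as pd and datalab.storage as
# gcs, as elsewhere in this listing).
df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
write_to_storage('my-bucket', df, 'exports/df.csv')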
Example #7
              Gini(train_label[valid_index], cv_train[valid_index]))
        cv_pred += model.predict(x=X_test, batch_size=512, verbose=0)[:, 0]

    print(Gini(train_label, cv_train))
    result = pd.DataFrame({'id': test_id, 'target': cv_pred * 1. / (NFOLDS)})
    return result


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir',
                        help='GCS or local path to training and testing data',
                        required=True)
    parser.add_argument('--output-name',
                        help='name of the output CSV written to GCS',
                        required=True)
    args = parser.parse_args()

    # v1
    # arguments = args.__dict__
    # data_dir = arguments.pop("data_dir")
    # output_name= arguments.pop("output_name")
    #v2
    data_dir = args.data_dir
    output_name = args.output_name

    outputpath = "Potro/output/" + str(output_name) + '.csv'
    result = main(data_dir)
    gcs.Bucket('stevenwho').item(outputpath).write_to(result.to_csv(),
                                                      'text/csv')
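
# Hypothetical invocation of the script above; the script name and data path
# are placeholders, only the two flags come from the argparse definition:
#   python task.py --data-dir gs://my-bucket/data --output-name my_run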
Example #8
    def train(self):
        self.train_model = Model(self.max_size) # to be changed
        batchsize = 275
        self.train_model.build_model()
        saver = tf.train.Saver(max_to_keep=10)
        summary_proto = tf.Summary()
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
            sess.run(init)
            writer_file_path = os.path.join(FLAGS.output_dir, FLAGS.experiment_name, 'improved_graph')
            checkpoint_file = os.path.join(FLAGS.output_dir, FLAGS.experiment_name, 'checkpoints')
            writer = tf.summary.FileWriter(writer_file_path, sess.graph)
            for epoch in range(0, self.epochs):
                print("Epoch number " + str(epoch))
                batch_idx = 0
                training_loss = 0.0
                for batch in self.shuffle_batches(self.X_train, self.y_train, batchsize):
                    inputs, targets = batch
                    feed_dict = {self.train_model.x: inputs,
                                 self.train_model.y: targets,
                                 self.train_model.is_training: True}
                    global_step, summary_train, accuracy, network_loss, _ = sess.run(
                        [self.train_model.global_step,
                         self.train_model.summary_ops,
                         self.train_model.acc_op,
                         self.train_model.loss,
                         self.train_model.train_op],
                        feed_dict=feed_dict)
                    training_loss += network_loss
                    batch_idx += 1
                    writer.add_summary(summary_train, global_step=global_step)
                    if batch_idx % 1 == 0:
                        print('Epoch ', epoch, ' and Batch ', batch_idx, ' | training loss is ',
                              training_loss / batch_idx)
                    # if batch_idx % 10 == 0:
                    #     saver.save(sess, checkpoint_file, global_step=global_step)
                    #     summary_proto.ParseFromString(summary_train)
                num_of_training_batches = batch_idx
                validation_loss = 0.
                batch_idx = 0
                #VALIDATION
                
                validation_feed = {self.train_model.x: self.X_test,
                                   self.train_model.y: self.y_test,
                                   self.train_model.is_training: False}
                [predicted_classes] = sess.run([self.train_model.pred_classes],
                                               feed_dict=validation_feed)
                print(predicted_classes)
                test_acc = f1_score(predicted_classes, np.array(self.y_test), average='micro')
                print('Epoch ', epoch, ' got score of  ', test_acc)

                # FINAL TESTING
                testing_feed = {self.train_model.x: self.test_input,
                                self.train_model.is_training: False}
                [predicted_classes] = sess.run([self.train_model.pred_classes],
                                                        feed_dict=testing_feed)
                test_input_pred = predicted_classes
                predicted_output = {'y': test_input_pred}
                predicted_output_df = pd.DataFrame(data=predicted_output)
                #predicted_output_df.set_index('id')
                if epoch % 5 == 0:
                    print('Epoch ', epoch, ' saved a new estimated file')
                    gcs.Bucket('aml-project3').item('output_deep/data.csv').write_to(predicted_output_df.to_csv(index_label='id'),'text/csv')
Example #9
def read_csv(file, nrows, usecols=None):
    stream = dlb_storage.Bucket('project-bagandata').item(file).read_from()
    data = pd.read_csv(BytesIO(stream), engine='c', nrows=nrows, low_memory=False, header='infer', usecols=usecols)
    return data
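
# Hypothetical call of the helper above; the object name, row limit, and
# column names are placeholders, while the bucket is the one hard-coded in
# read_csv().
sample = read_csv('rawdata.csv', nrows=1000, usecols=['user_id', 'track_id'])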
Example #10
 def load_users(self, **kwargs):
     os.environ[
         'GOOGLE_APPLICATION_CREDENTIALS'] = '/disk/ru/My First Project-d1196e9f3e13.json'
     s = str(gcs.Bucket('ru_test').item('users').read_from(), 'utf-8')
     users = pd.read_json(StringIO(s), lines=True)
     return users