def __iter__(self):
    return stream.iter_csv(
        self.path,
        target='passengers',
        converters={'passengers': int},
        parse_dates={'month': '%Y-%m'}
    )

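# A hedged usage sketch (not from the source): each iterator in this section
# yields (x, y) pairs, where x is a dict of features and y is the converted
# target. Assuming river's stream.iter_csv and a local copy of the
# airline-passengers CSV (the path here is hypothetical):
from river import stream

dataset = stream.iter_csv(
    'airline-passengers.csv',
    target='passengers',
    converters={'passengers': int},
    parse_dates={'month': '%Y-%m'},
)
x, y = next(dataset)
print(x, y)  # {'month': datetime.datetime(1949, 1, 1, 0, 0)} 112
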
def _stream_X_y(self, directory):
    return stream.iter_csv(
        f'{directory}/trec07p.csv',
        target='y',
        delimiter=',',
        quotechar='"',
        field_size_limit=1_000_000,
    )

def _iter(self):
    return stream.iter_csv(
        self.path,
        target='y',
        delimiter=',',
        quotechar='"',
        field_size_limit=1_000_000,
    )

def _stream_X_y(self, directory):
    return stream.iter_csv(
        f'{directory}/smtp.csv',
        target='service',
        converters={
            'duration': float,
            'src_bytes': float,
            'dst_bytes': float,
            'service': int
        }
    )

def __iter__(self):
    return stream.iter_csv(
        self.path,
        target='weight',
        converters={
            'time': int,
            'weight': int,
            'chick': int,
            'diet': int
        }
    )

def _iter(self):
    return stream.iter_csv(
        self.path,
        target='service',
        converters={
            'duration': float,
            'src_bytes': float,
            'dst_bytes': float,
            'service': int
        }
    )

def _iter(self):
    return stream.iter_csv(
        self.path,
        target='rating',
        converters={
            'timestamp': int,
            'release_date': int,
            'age': float,
            'rating': float
        },
        delimiter='\t'
    )

def _iter(self):
    return stream.iter_csv(
        self.path,
        target='visitors',
        converters={
            'latitude': float,
            'longitude': float,
            'visitors': int,
            'is_holiday': ast.literal_eval
        },
        parse_dates={'date': '%Y-%m-%d'}
    )

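# Why ast.literal_eval above (a clarifying note, not from the source): CSV
# fields always arrive as strings, so a column holding Python literals such
# as 'True'/'False' needs real parsing; a plain bool() cast is wrong because
# every non-empty string is truthy:
import ast

assert ast.literal_eval('True') is True
assert ast.literal_eval('False') is False
assert bool('False') is True  # the naive cast would mislabel every row
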
def _iter(self):
    converters = {f'V{i}': float for i in range(1, 29)}
    converters['Class'] = int
    converters['Time'] = float
    converters['Amount'] = float
    return stream.iter_csv(self.path, target='Class', converters=converters)

def __iter__(self):
    return stream.iter_csv(
        self.path,
        target='five_thirty_eight',
        converters={
            'ordinal_date': int,
            'gallup': float,
            'ipsos': float,
            'morning_consult': float,
            'rasmussen': float,
            'you_gov': float,
            'five_thirty_eight': float
        }
    )

def _iter(self):
    return stream.iter_csv(
        self.path,
        target='bikes',
        converters={
            'clouds': int,
            'humidity': int,
            'pressure': float,
            'temperature': float,
            'wind': float,
            'bikes': int
        },
        parse_dates={'moment': '%Y-%m-%d %H:%M:%S'}
    )

def _stream_X_y(self, directory):
    return stream.iter_csv(
        os.path.join(directory, 'trump_approval.csv.gz'),
        target='five_thirty_eight',
        converters={
            'ordinal_date': int,
            'gallup': float,
            'ipsos': float,
            'morning_consult': float,
            'rasmussen': float,
            'you_gov': float,
            'five_thirty_eight': float
        }
    )

def _iter(self):
    return stream.iter_csv(
        self.path,
        target='class',
        converters={
            'date': float,
            'day': int,
            'period': float,
            'nswprice': float,
            'nswdemand': float,
            'vicprice': float,
            'vicdemand': float,
            'transfer': float,
            'class': lambda x: x == 'UP'
        }
    )

def _stream_X_y(self, directory):
    return stream.iter_csv(
        f'{directory}/toulouse_bikes.csv',
        target='bikes',
        converters={
            'clouds': int,
            'humidity': int,
            'pressure': float,
            'temperature': float,
            'wind': float,
            'bikes': int
        },
        parse_dates={'moment': '%Y-%m-%d %H:%M:%S'}
    )

def _iter(self):
    return stream.iter_csv(
        self.path,
        target='trip_duration',
        converters={
            'passenger_count': int,
            'pickup_longitude': float,
            'pickup_latitude': float,
            'dropoff_longitude': float,
            'dropoff_latitude': float,
            'trip_duration': int
        },
        parse_dates={'pickup_datetime': '%Y-%m-%d %H:%M:%S'},
        drop=['dropoff_datetime', 'id']
    )

def _stream_X_y(self, directory):
    return stream.iter_csv(
        f'{directory}/train.csv',
        target='trip_duration',
        converters={
            'passenger_count': int,
            'pickup_longitude': float,
            'pickup_latitude': float,
            'dropoff_longitude': float,
            'dropoff_latitude': float,
            'trip_duration': int
        },
        parse_dates={'pickup_datetime': '%Y-%m-%d %H:%M:%S'},
        drop=['dropoff_datetime', 'id']
    )

def __iter__(self):
    return stream.iter_csv(
        self.path,
        target='is_phishing',
        converters={
            'empty_server_form_handler': float,
            'popup_window': float,
            'https': float,
            'request_from_other_domain': float,
            'anchor_from_other_domain': float,
            'is_popular': float,
            'long_url': float,
            'age_of_domain': int,
            'ip_in_url': int,
            'is_phishing': lambda x: x == '1'
        }
    )

def _iter(self):
    features = [
        'lepton pT', 'lepton eta', 'lepton phi',
        'missing energy magnitude', 'missing energy phi',
        'jet 1 pt', 'jet 1 eta', 'jet 1 phi', 'jet 1 b-tag',
        'jet 2 pt', 'jet 2 eta', 'jet 2 phi', 'jet 2 b-tag',
        'jet 3 pt', 'jet 3 eta', 'jet 3 phi', 'jet 3 b-tag',
        'jet 4 pt', 'jet 4 eta', 'jet 4 phi', 'jet 4 b-tag',
        'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb'
    ]
    return stream.iter_csv(
        self.path,
        fieldnames=['is_signal', *features],
        target='is_signal',
        converters={
            'is_signal': lambda x: x.startswith('1'),
            **{f: float for f in features}
        }
    )

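# A hedged sketch (not from the source): the fieldnames argument above
# supplies column names for a headerless CSV such as the HIGGS file, and the
# startswith('1') converter handles labels written as '1.000...e+00'.
# Assuming river's iter_csv, which also accepts a file-like buffer in place
# of a path; the two-feature layout here is made up for illustration:
import io
from river import stream

buffer = io.StringIO('1.000000000000000000e+00,0.5,-0.2\n0.0,1.3,0.7\n')
for x, y in stream.iter_csv(
    buffer,
    fieldnames=['is_signal', 'f1', 'f2'],
    target='is_signal',
    converters={'is_signal': lambda v: v.startswith('1'), 'f1': float, 'f2': float},
):
    print(y, x)  # True {'f1': 0.5, 'f2': -0.2}, then False {'f1': 1.3, 'f2': 0.7}
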
def __iter__(self):
    return stream.iter_csv(
        self.path,
        target=['c-class-flares', 'm-class-flares', 'x-class-flares'],
        converters={
            'zurich-class': str,
            'largest-spot-size': str,
            'spot-distribution': str,
            'activity': int,
            'evolution': int,
            'previous-24h-flare-activity': int,
            'hist-complex': int,
            'hist-complex-this-pass': int,
            'area': int,
            'largest-spot-area': int,
            'c-class-flares': int,
            'm-class-flares': int,
            'x-class-flares': int
        }
    )

def __iter__(self):
    return stream.iter_csv(
        self.path,
        target='category',
        converters={
            'region-centroid-col': int,
            'region-centroid-row': int,
            'short-line-density-5': float,
            'short-line-density-2': float,
            'vedge-mean': float,
            'vegde-sd': float,
            'hedge-mean': float,
            'hedge-sd': float,
            'intensity-mean': float,
            'rawred-mean': float,
            'rawblue-mean': float,
            'rawgreen-mean': float,
            'exred-mean': float,
            'exblue-mean': float,
            'exgreen-mean': float,
            'value-mean': float,
            'saturation-mean': float,
            'hue-mean': float
        }
    )

BATCH_SIZE = 256

write_dataset('train.csv', db['features'][:SPLIT_INDEX], db['labels'][:SPLIT_INDEX], BATCH_SIZE)
write_dataset('test.csv', db['features'][SPLIT_INDEX:], db['labels'][SPLIT_INDEX:], BATCH_SIZE)

FEATURE_SIZE = db['features'].shape[1]
types = {f'feature_{i}': float for i in range(FEATURE_SIZE)}
types['class'] = int

model = StandardScaler()
model |= OneVsRestClassifier(LogisticRegression())
metric = Accuracy()

dataset = stream.iter_csv('train.csv', target_name='class', converters=types)

print('Training started...')
for i, (X, y) in enumerate(dataset):
    predictions = model.predict_one(X)
    model = model.fit_one(X, y)
    metric = metric.update(y, predictions)
    if i % 100 == 0:
        print(f'Update {i} - {metric}')
print(f'Final - {metric}')

metric = Accuracy()
test_dataset = stream.iter_csv('test.csv', target_name='class', converters=types)

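# The excerpt stops after building test_dataset. A plausible completion (an
# assumption, mirroring the training loop above) scores the held-out stream
# without any further calls to fit_one:
for X, y in test_dataset:
    predictions = model.predict_one(X)
    metric = metric.update(y, predictions)
print(f'Test - {metric}')
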
def _stream_X_y(self, directory):
    return stream.iter_csv(
        os.path.join(directory, 'airline-passengers.csv'),
        target='passengers',
        converters={'passengers': int},
        parse_dates={'month': '%Y-%m'}
    )

    required=True, help='Path to test CSV file.')
argument_parser.add_argument(
    '-n', '--num-cols', type=int, required=True,
    help='Number of columns in the feature CSV file (excluding label).')
arguments = vars(argument_parser.parse_args())

print('[INFO] Building column names...')
types = {f'feature_{i}': float for i in range(arguments['num_cols'])}  # data type per feature
types['class'] = int

dataset = stream.iter_csv(arguments['train'], target_name='class', types=types)

model = Pipeline([
    ('scaler', StandardScaler()),
    ('learner', OneVsRestClassifier(binary_classifier=PAClassifier()))
])
metric = Accuracy()

print('[INFO] Training started...')
for index, (X, y) in enumerate(dataset):
    try:
        predictions = model.predict_one(X)
        model = model.fit_one(X, y)
        metric = metric.update(y, predictions)
        if index % 10 == 0:
            print(f'[INFO] Update {index} - {metric}')
    except Exception as exc:  # the excerpt truncates before the handler; body is an assumption
        print(f'[WARN] Skipping sample {index}: {exc}')

ap.add_argument("-c", "--csv", required=True, help="path to features CSV file") ap.add_argument("-n", "--cols", type=int, required=True, help="# of feature columns in the CSV file (excluding class column") args = vars(ap.parse_args()) # construct our data dictionary which maps the data types of the # columns in the CSV file to built-in data types print("[INFO] building column names...") types = {"feat_{}".format(i): float for i in range(0, args["cols"])} types["class"] = int # create a CSV data generator for the extracted Keras features dataset = stream.iter_csv(args["csv"], target_name="class", types=types) # construct our pipeline model = Pipeline([ ("scale", StandardScaler()), ("learn", OneVsRestClassifier(binary_classifier=LogisticRegression()))]) # initialize our metric print("[INFO] starting training...") metric = Accuracy() # loop over the dataset for (i, (X, y)) in enumerate(dataset): # make predictions on the current set of features, train the # model on the features, and then update our metric preds = model.predict_one(X)
PATH_TO_CSV = 'bbc-text.csv'

start_time = time.time()
logger = logging.getLogger()
logger.setLevel(logging.WARN)
logging.warning('\tLoading word embeddings and data streamer...')

nlp = spacy.load('en_core_web_md')
encodings = {
    'tech': 0,
    'business': 1,
    'sport': 2,
    'entertainment': 3,
    'politics': 4
}
types = {'category': str}
dataset = stream.iter_csv(PATH_TO_CSV, target_name='category', types=types)

stop_time = time.time()
elapsed_time = stop_time - start_time
logging.info('\tFinished in {0} seconds.'.format(elapsed_time))

classifier = MLPClassifier(activation='tanh', learning_rate='constant',
                           alpha=1e-4, hidden_layer_sizes=(15,),
                           random_state=1, batch_size=16, verbose=False,
                           max_iter=20, warm_start=True)
predictions = []

"-n", "--num_cols", type=int, required=True, help="# of feature columns in the CSV file (excluding class column") args = vars(ap.parse_args()) # construct our data dictionary which maps the data types of the # columns in the CSV file to built-in data types print("[INFO] building column names...") types = {f'feat_{i}': float for i in range(args['num_cols'])} types["class"] = int # create a CSV data generator for the extracted Keras features dataset = stream.iter_csv(filepath_or_buffer=args["csv"], target_name="class", converters=types) # construct our pipeline model = Pipeline(StandardScaler(), OneVsRestClassifier(binary_classifier=PAClassifier())) # initialize our metric print("[INFO] starting training...") metric = ClassificationReport() # loop over the dataset for i, (X, y) in enumerate(dataset): # make predictions on the current set of features, train the # model on the features, and then update our metric preds = model.predict_one(X)
ap.add_argument( "-n", "--cols", type=int, required=True, help="# of feature columns in the CSV file (excluding class column") args = vars(ap.parse_args()) # construct our data dictionary which maps the data types of the # columns in the CSV file to built-in data types print("[INFO] building column names...") types = {"feat_{}".format(i): float for i in range(0, args["cols"])} types["class"] = int # create a CSV data generator for the extracted Keras features dataset = stream.iter_csv(args["csv"], target="class", converters=types) # construct our pipeline (maybe set to .0000003) model = Pipeline(StandardScaler(), LogisticRegression(optimizer=optim.SGD(.0000001))) # initialize our metric print("[INFO] starting training...") metric = Accuracy() # loop over the dataset for (i, (X, y)) in enumerate(dataset): # make predictions on the current set of features, train the # model on the features, and then update our metric preds = model.predict_one(X) model = model.fit_one(X, y) metric = metric.update(y, preds)
def _stream_X_y(self, directory):
    # The six emotion labels. The spellings ('suprised', 'clam', 'aggresive')
    # are kept verbatim: they must match the CSV's column names exactly.
    targets = [
        'amazed-suprised', 'happy-pleased', 'relaxing-clam',
        'quiet-still', 'sad-lonely', 'angry-aggresive'
    ]
    converters = {target: (lambda x: x == '1') for target in targets}

    # Rhythmic/timbre features: {Mean,Std}_Acc1298_{Mean,Std}_Mem40_ crossed
    # with Centroid, Rolloff, Flux and MFCC_0..MFCC_12, all parsed as floats.
    for outer in ('Mean', 'Std'):
        for inner in ('Mean', 'Std'):
            for suffix in ('Centroid', 'Rolloff', 'Flux', *(f'MFCC_{i}' for i in range(13))):
                converters[f'{outer}_Acc1298_{inner}_Mem40_{suffix}'] = float

    # Beat histogram features.
    converters.update({
        'BH_LowPeakAmp': float, 'BH_LowPeakBPM': int,
        'BH_HighPeakAmp': float, 'BH_HighPeakBPM': int,
        'BH_HighLowRatio': int,
        'BHSUM1': float, 'BHSUM2': float, 'BHSUM3': float
    })

    return stream.iter_csv(directory, target=targets, converters=converters)

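# A hedged sketch (not from the source) of what a multi-output stream like
# the one above yields: when target is a list, y is a dict keyed by target
# name instead of a scalar. The buffer and column names here are made up for
# illustration, assuming river's iter_csv accepts file-like buffers:
import io
from river import stream

buffer = io.StringIO('a,b,y1,y2\n1.0,2.0,1,0\n')
for x, y in stream.iter_csv(buffer, target=['y1', 'y2'],
                            converters={'a': float, 'b': float, 'y1': int, 'y2': int}):
    print(x)  # {'a': 1.0, 'b': 2.0}
    print(y)  # {'y1': 1, 'y2': 0}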