def execute():
    app = request.forms['app']
    user = request.forms['user']
    cid = request.forms['cid']
    desc = request.forms['desc']
    np = request.forms['np']
    appmod = pickle.loads(request.forms['appmod'])
    # remove the appmod key
    del request.forms['appmod']
    appmod.write_params(request.forms, user)
    # if preprocess is set run the preprocessor
    try:
        if appmod.preprocess:
            run_params, _, _ = appmod.read_params(user, cid)
            base_dir = os.path.join(user_dir, user, app)
            process.preprocess(run_params, appmod.preprocess, base_dir)
            if appmod.preprocess == "terra.in":
                appmod.outfn = "out" + run_params['casenum'] + ".00"
    except:
        return template('error', err="There was an error with the preprocessor")
    # submit job to queue
    try:
        priority = db(users.user == user).select(users.priority).first().priority
        uid = users(user=user).id
        jid = sched.qsub(app, cid, uid, np, priority, desc)
        return str(jid)
        # redirect("http://localhost:" + str(config.port) + "/case?app=" + str(app)
        #          + "&cid=" + str(cid) + "&jid=" + str(jid))
    except OSError:
        return "ERROR: a problem occurred"
def __init__(self, input_files, label_files=None, batch_size=1,
             label_types=None, concat_files=None, tile_inputs=False):
    # concat_files added to the signature: the body below references it.
    self.input_files = input_files
    self.label_files = label_files
    self.inputs = np.array([preprocess(file, resize=True, tile=tile_inputs)
                            for file in input_files])
    self.inputs = np.reshape(self.inputs, (-1,) + self.inputs.shape[2:])
    if concat_files is not None:
        concats = [[preprocess(file, resize=True, tile=tile_inputs) for file in channel]
                   for channel in concat_files]
        concats = np.reshape(concats, (-1,) + self.inputs.shape[2:-1] + (len(concats),))
        # original call was garbled ("concats*"); assumed intent is to append
        # the extra channels along the last axis
        self.inputs = np.concatenate((self.inputs, concats), axis=-1)
    if label_files is not None:
        self.labels = np.array([preprocess(file, resize=True, tile=tile_inputs)
                                for file in label_files])
        self.labels = np.reshape(self.labels, (-1,) + self.labels.shape[2:])
    else:
        self.labels = None
    self.batch_size = batch_size
    self.label_types = label_types
    self.tile_inputs = tile_inputs
    self.n = len(self.inputs)
    self.idx = 0
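# A runnable illustration of the reshape used above: per-file preprocessing that
# returns several tiles per volume is flattened so every tile becomes its own
# sample. The array shapes here are synthetic stand-ins for preprocess() output.
import numpy as np

n_files, tiles, d, h, w, c = 3, 8, 16, 16, 16, 1
inputs = np.zeros((n_files, tiles, d, h, w, c))          # (files, tiles, D, H, W, C)
inputs = np.reshape(inputs, (-1,) + inputs.shape[2:])    # -> (files * tiles, D, H, W, C)
print(inputs.shape)                                      # (24, 16, 16, 16, 1)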
def light_gbm_predict(do_not_pre_classes):
    from process import preprocess
    preprocess()
    x_test = pd.read_csv(r"../user_data/x_test.csv").iloc[:, 1:]
    x_test['age'].fillna(42.627019408001736, inplace=True)
    print(x_test)
    rows_number = x_test.iloc[:, 0].size
    clf_list = load_model("model.ml")
    pred_list = []
    name2idx, idx2name = read_class_name(config.arrythmia)
    for i in tqdm(range(34)):
        if i in do_not_pre_classes:
            pred_list.append(np.zeros(rows_number))
            print("skip .........")
            continue
        clf = clf_list[i]
        p_test = clf.predict(x_test)
        pred_list.append(p_test)
    return np.array(pred_list).T
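# A small runnable sketch of the output layout produced by light_gbm_predict():
# one binary prediction vector per class is stacked and transposed, giving a
# (n_samples, n_classes) matrix. The numbers below are synthetic placeholders.
import numpy as np

rows_number, n_classes = 5, 34
pred_list = [np.zeros(rows_number) for _ in range(n_classes)]  # one vector per class
pred_matrix = np.array(pred_list).T
print(pred_matrix.shape)  # (5, 34): rows are samples, columns are classes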
def __getitem__(self, idx):
    batch = []
    for file in self.inputs[self.batch_size * idx:self.batch_size * (idx + 1)]:
        if self.load_files:
            volume = file
        elif self.concat is None:
            volume = preprocess(file, self.funcs)
        else:
            volume = np.concatenate((preprocess(file, self.funcs), self.concat), axis=-1)
        batch.append(volume)
    batch = np.array(batch)
    if self.seeds is not None:
        seeds = []
        for file in self.seeds[self.batch_size * idx:self.batch_size * (idx + 1)]:
            seed = file if self.load_files else preprocess(file, ['resize'])
            seeds.append(seed)
        batch = np.concatenate((batch, np.array(seeds)), axis=-1)
    if self.seed_type is not None:
        if self.labels is None:
            raise ValueError('No labels to generate slices.')
        if self.seeds is not None:
            raise ValueError('Seeds already exist.')
        new_batch = np.zeros(tuple(list(batch.shape[:-1]) + [batch.shape[-1] + 1]))
        for i, file in enumerate(self.labels[self.batch_size * idx:self.batch_size * (idx + 1)]):
            label = file if self.load_files else preprocess(file, ['resize'])
            if self.seed_type == 'slice':
                seed = np.zeros(batch[i].shape)
                r = np.random.choice(label.shape[0])
                while not np.any(label[r]):
                    r = np.random.choice(label.shape[0])
                seed[r] = label[r]
            elif self.seed_type == 'volume':
                seed = label.copy()
            new_batch[i] = np.concatenate((batch[i], seed), axis=-1)
        batch = new_batch
    if self.include_labels:
        if self.labels is None:
            raise ValueError('No labels provided.')
        labels = []
        for file in self.labels[self.batch_size * idx:self.batch_size * (idx + 1)]:
            label = file if self.load_files else preprocess(file, ['resize'])
            labels.append(label)
        labels = np.array(labels)
        batch = (batch, labels)
    return batch
def _get_batch(self, index_array):
    batch = []
    if self.label_types is None:
        for _, i in enumerate(index_array):
            if self.load_files:
                x = self.inputs[i]
            elif self.tile_inputs:
                x = preprocess(self.inputs[i], tile=self.tile_inputs)[i % 8]
            elif self.random_gen:
                s = self.samples[i]
                n = np.random.choice(self.frames[s])
                x = preprocess(_format(self.input_file_format, s, n),
                               resize=self.resize, tile=self.tile_inputs)
            else:
                x = preprocess(self.inputs[i], resize=self.resize)
            if self.augment:
                x = self.image_transformer.random_transform(x, seed=self.seed)
            batch.append(x)
        return np.asarray(batch)
    labels = []
    for _, i in enumerate(index_array):
        if self.load_files:
            x = self.inputs[i]
            y = self.labels[i]
        elif self.tile_inputs:
            x = preprocess(self.inputs[i], tile=self.tile_inputs)[i % 8]
            y = preprocess(self.labels[i], tile=self.tile_inputs)[i % 8]
        elif self.random_gen:
            s = self.samples[i]
            n = np.random.choice(self.frames[s])
            x = preprocess(_format(self.input_file_format, s, n),
                           resize=self.resize, tile=self.tile_inputs)
            y = preprocess(_format(self.label_file_format, s, n),
                           resize=self.resize, tile=self.tile_inputs)
        else:
            x = preprocess(self.inputs[i], resize=self.resize)
            y = preprocess(self.labels[i], resize=self.resize)
        if self.augment:
            x, y = self.image_transformer.random_transform(x, y, seed=self.seed)
        batch.append(x)
        labels.append(y)
    all_labels = []
    for label_type in self.label_types:
        if label_type == 'label':
            if self.labels is None:
                raise ValueError('Labels not provided.')
            all_labels.append(labels)
        elif label_type == 'input':
            all_labels.append(batch)
        else:
            raise ValueError(f'Label type {label_type} is not supported.')
    if len(all_labels) == 1:
        all_labels = all_labels[0]
    return (np.asarray(batch), np.asarray(all_labels))
def __init__(self, input_files, seed_files=None, label_files=None, batch_size=1,
             seed_type=None, crop_size=constants.SHAPE, concat_files=None,
             load_files=False, include_labels=False, rescale=True):
    self.inputs = input_files
    self.seeds = seed_files
    self.labels = label_files
    self.batch_size = batch_size
    self.seed_type = seed_type
    self.crop_size = crop_size
    self.concat = None
    self.load_files = load_files
    self.include_labels = include_labels
    self.rescale = rescale
    self.shape = shape(input_files[0])
    self.n = len(input_files)
    self.idx = 0
    if concat_files is not None:
        self.concat = np.concatenate(
            (preprocess(concat_files[0], resize=True, rescale=True),
             preprocess(concat_files[1], resize=True)), axis=-1)
    if load_files:
        self.inputs = np.array([preprocess(file, resize=True, rescale=self.rescale)
                                for file in input_files])
        if self.concat is not None:
            new_inputs = []
            for vol in self.inputs:
                new_inputs.append(np.concatenate((vol, self.concat), axis=-1))
            self.inputs = np.array(new_inputs)
        if seed_files is not None:
            self.seeds = np.array([preprocess(file, resize=True) for file in seed_files])
        if label_files is not None:
            self.labels = np.array([preprocess(file, resize=True) for file in label_files])
def test_factorized(input_file, model, ckpt_dir, scale, cube_size, min_num, postfix=''):
    # Pre-process
    cubes, cube_positions, points_numbers = preprocess(input_file, scale, cube_size, min_num)
    # Encoding
    strings, min_v, max_v, shape = compress_factorized(cubes, model, ckpt_dir)
    # Write files
    filename = os.path.split(input_file)[-1][:-4]
    print(filename)
    rootdir = './compressed' + postfix + '/'
    bytes_strings, bytes_pointnums, bytes_cubepos = write_binary_files_factorized(
        filename, strings.numpy(), points_numbers, cube_positions,
        min_v.numpy(), max_v.numpy(), shape.numpy(), rootdir)
    # Read files
    strings_d, points_numbers_d, cube_positions_d, min_v_d, max_v_d, shape_d = \
        read_binary_files_factorized(filename, rootdir)
    # Decoding
    cubes_d = decompress_factorized(strings_d, min_v_d, max_v_d, shape_d, model, ckpt_dir)
    # bpp
    N = get_points_number(input_file)
    bpp = round(8 * (bytes_strings + bytes_pointnums + bytes_cubepos) / float(N), 4)
    bpp_strings = round(8 * bytes_strings / float(N), 4)
    bpp_pointsnums = round(8 * bytes_pointnums / float(N), 4)
    bpp_cubepos = round(8 * bytes_cubepos / float(N), 4)
    bpp_strings_hyper = 0
    bpp_strings_head = 0
    bpps = [bpp, bpp_strings, bpp_strings_hyper, bpp_strings_head, bpp_pointsnums, bpp_cubepos]
    return cubes_d, cube_positions_d, points_numbers_d, N, bpps
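# A runnable check of the bits-per-point arithmetic used above: each byte count
# is converted to bits and normalised by the number of input points. The byte
# counts and point count here are made up for illustration only.
bytes_strings, bytes_pointnums, bytes_cubepos = 120_000, 4_000, 2_000
N = 757_691  # hypothetical number of points in the input cloud
bpp = round(8 * (bytes_strings + bytes_pointnums + bytes_cubepos) / float(N), 4)
print(bpp)  # total bits spent per input point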
def e2e(s):
    # make the input space-delimited in prefix notation
    ir = postprocess(infix_to_prefix(preprocess(s)))
    # split on space and turn into nested tuples
    tup = tuple_for_polish_expression(ir.split(' '))
    # convert to MRS and return
    return prettyUMRSForTuple(tup)
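# A minimal usage sketch for e2e(), assuming preprocess, infix_to_prefix,
# postprocess and the MRS helpers above are importable from this module.
# The expression string and the printed output are illustrative only.
if __name__ == '__main__':
    expression = "a + b * c"      # hypothetical infix input
    print(e2e(expression))        # prints the pretty UMRS for the parsed expression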
def execute():
    global user
    check_user_var()
    app = request.forms.app
    cid = request.forms.cid
    np = request.forms.np
    desc = request.forms.desc
    # priority = request.forms.priority
    params = {}
    base_dir = os.path.join(myapps[app].user_dir, user, app, cid)
    # if preprocess is set run the preprocessor
    try:
        if myapps[app].preprocess:
            run_params, _, _ = myapps[app].read_params(user, cid)
            processed_inputs = process.preprocess(run_params, myapps[app].preprocess, base_dir)
            if myapps[app].preprocess == "terra.in":
                myapps[app].outfn = "out" + run_params['casenum'] + ".00"
    except:
        return template('error', err="There was an error with the preprocessor")
    # submit job to queue
    try:
        params['cid'] = cid
        params['app'] = app
        params['user'] = user
        priority = db(users.user == user).select(users.priority).first().priority
        jid = sched.qsub(app, cid, user, np, priority, desc)
        redirect("/case?app=" + app + "&cid=" + cid + "&jid=" + jid)
    except OSError, e:
        print >>sys.stderr, "Execution failed:", e
        params = {'cid': cid, 'output': pbuffer, 'app': app, 'user': user,
                  'err': e, 'apps': myapps.keys()}
        return template('error', params)
def inference(self, img):
    img_info = {"id": 0}
    if isinstance(img, str):
        img_info["file_name"] = os.path.basename(img)
        img = cv2.imread(img)
        if img is None:
            raise ValueError("test image path is invalid!")
    else:
        img_info["file_name"] = None
    height, width = img.shape[:2]
    img_info["height"] = height
    img_info["width"] = width
    img_info["raw_img"] = img
    img, ratio = preprocess(img, self.test_size, self.rgb_means, self.std)
    img_info["ratio"] = ratio
    img = F.expand_dims(mge.tensor(img), 0)
    t0 = time.time()
    outputs = self.model(img)
    outputs = postprocess(outputs, self.num_classes, self.confthre, self.nmsthre)
    logger.info("Infer time: {:.4f}s".format(time.time() - t0))
    return outputs, img_info
def __init__(self, input_files, label_files=None, batch_size=1, seed_type=None,
             concat_files=None, rotation_range=90., shift_range=0.1,
             shear_range=0.1, zoom_range=0.1, crop_size=constants.SHAPE,
             fill_mode='nearest', cval=0., flip=True):
    self.inputs = np.array([preprocess(file, rescale=True) for file in input_files])
    if label_files is not None:
        self.labels = np.array([preprocess(file) for file in label_files])
    else:
        self.labels = None
    self.seed_type = seed_type
    if concat_files is not None:
        concat = np.concatenate(
            (preprocess(concat_files[0]), preprocess(concat_files[1])), axis=-1)
        new_inputs = []
        for vol in self.inputs:
            new_inputs.append(np.concatenate((vol, concat), axis=-1))
        self.inputs = np.array(new_inputs)
    image_transformer = ImageTransformer(rotation_range=rotation_range,
                                         shift_range=shift_range,
                                         shear_range=shear_range,
                                         zoom_range=zoom_range,
                                         crop_size=crop_size,
                                         fill_mode=fill_mode,
                                         cval=cval,
                                         flip=flip)
    super().__init__(self.inputs, self.labels, image_transformer, batch_size=batch_size)
def process_data(self, filename):
    """Load the file and preprocess the data."""
    self.data = preprocess(filename)
    self.tablecmd, self.tablespeed, self.tableacc, self.speedsection, \
        self.accsection, self.timesection = process(self.data)
def process_data(self, filename):
    """Load the file and preprocess the data."""
    self.data = preprocess(filename)
    self.tablecmd, self.tablespeed, self.tableacc, self.speedsection, \
        self.accsection, self.timesection = process(self.data)
def __init__(self, dataset_path: str, max_length: int):
    queries_path = os.path.join(dataset_path, 'train', 'train_data')
    labels_path = os.path.join(dataset_path, 'train', 'train_label')
    with open(queries_path, 'rt', encoding='utf8') as f:
        self.queries = preprocess(f.readlines(), max_length)
    with open(labels_path) as f:
        self.labels = np.array([[np.float32(x)] for x in f.readlines()])
def __init__(self, dataset_path: str, vocasize: int, minlen: int, maxlen: int):
    data_review = path.join(dataset_path, 'train', 'train_data')
    data_label = path.join(dataset_path, 'train', 'train_label')
    with open(data_review, 'rt', encoding='utf-8') as f:
        self.reviews = preprocess(f.readlines(), vocasize, minlen, maxlen)
    with open(data_label) as f:
        self.labels = [np.float32(x) for x in f.readlines()]
def infer(raw_data, **kwargs):
    data = preprocess(raw_data, config.vocasize, config.minlen, config.maxlen)
    model.eval()
    prediction = model(data)
    point = prediction.data.squeeze(dim=1).tolist()
    # DONOTCHANGE: They are reserved for nsml
    # The result must be returned as a list of (confidence interval, point) pairs
    # to be posted to the leaderboard; the confidence interval value does not
    # affect the leaderboard score.
    return list(zip(np.zeros(len(point)), point))
def test_hyper(input_file, model, ckpt_dir, scale, cube_size, min_num, postfix=''):
    # Pre-process
    cubes, cube_positions, points_numbers = preprocess(input_file, scale, cube_size, min_num)
    # Encoding
    y_strings, y_min_vs, y_max_vs, y_shape, z_strings, z_min_v, z_max_v, z_shape, x_ds = \
        compress_hyper(cubes, model, ckpt_dir, True)
    # Write files
    filename = os.path.split(input_file)[-1][:-4]
    print(filename)
    rootdir = './compressed' + postfix + '/'
    bytes_strings, bytes_strings_head, bytes_strings_hyper, bytes_pointnums, bytes_cubepos = \
        write_binary_files_hyper(
            filename, y_strings.numpy(), z_strings.numpy(), points_numbers, cube_positions,
            y_min_vs.numpy(), y_max_vs.numpy(), y_shape.numpy(),
            z_min_v.numpy(), z_max_v.numpy(), z_shape.numpy(), rootdir)
    # Read files
    y_strings_d, z_strings_d, points_numbers_d, cube_positions_d, y_min_vs_d, y_max_vs_d, \
        y_shape_d, z_min_v_d, z_max_v_d, z_shape_d = read_binary_files_hyper(filename, rootdir)
    # Decoding
    cubes_d = decompress_hyper(y_strings_d, y_min_vs_d.astype('int32'),
                               y_max_vs_d.astype('int32'), y_shape_d, z_strings_d,
                               z_min_v_d, z_max_v_d, z_shape_d, model, ckpt_dir)
    # cheat!!!
    print("decoding error on gpu", "!" * 20, np.max(tf.abs(cubes_d - x_ds).numpy()), "!" * 20)
    cubes_d = x_ds
    # bpp
    N = get_points_number(input_file)
    bpp = round(8 * (bytes_strings + bytes_strings_head + bytes_strings_hyper +
                     bytes_pointnums + bytes_cubepos) / float(N), 4)
    bpp_strings = round(8 * bytes_strings / float(N), 4)
    bpp_strings_hyper = round(8 * bytes_strings_hyper / float(N), 4)
    bpp_strings_head = round(8 * bytes_strings_head / float(N), 4)
    bpp_pointsnums = round(8 * bytes_pointnums / float(N), 4)
    bpp_cubepos = round(8 * bytes_cubepos / float(N), 4)
    bpps = [bpp, bpp_strings, bpp_strings_hyper, bpp_strings_head, bpp_pointsnums, bpp_cubepos]
    return cubes_d, cube_positions_d, points_numbers_d, N, bpps
def __init__(self, input_files, label_files=None, concat_files=None, batch_size=1,
             rotation_range=90., shift_range=0.1, shear_range=0.1, zoom_range=0.1,
             crop_size=constants.SHAPE, fill_mode='nearest', cval=0., flip=True,
             label_types=None):
    self.input_files = input_files
    self.label_files = label_files
    self.inputs = [preprocess(file) for file in input_files]
    if concat_files is not None:
        concats = [[preprocess(file) for file in channel] for channel in concat_files]
        # original call was garbled ("concats*"); assumed intent is to append the
        # per-channel volumes to the inputs along the last axis
        self.inputs = np.concatenate((self.inputs, *concats), axis=-1)
    if label_files is not None:
        self.labels = [preprocess(file) for file in label_files]
    else:
        self.labels = None
    self.label_types = label_types
    image_transformer = ImageTransformer(rotation_range=rotation_range,
                                         shift_range=shift_range,
                                         shear_range=shear_range,
                                         zoom_range=zoom_range,
                                         crop_size=crop_size,
                                         fill_mode=fill_mode,
                                         cval=cval,
                                         flip=flip)
    super().__init__(self.inputs, self.labels, image_transformer, batch_size=batch_size)
def json():
    """
    Process ads JSON with daily breakdown of channel (optional), ad_id,
    impressions, engagements, clicks and conversions; return options with
    suggested status or share for next period.
    """
    # Check if JSON contains required data
    if 'optimize' not in request.json:  # pragma: no cover
        if 'stats' not in request.json:
            return '"optimize" and "stats" keys missing in posted JSON object'
        return '"optimize" key missing in posted JSON object'
    if 'stats' not in request.json:  # pragma: no cover
        return '"stats" key missing in posted JSON object'
    if not request.json['optimize']:  # pragma: no cover
        if not request.json['stats']:
            return '"optimize" and "stats" keys are empty'
        return '"optimize" key is empty'
    if not request.json['stats']:  # pragma: no cover
        return '"stats" key is empty'
    weights = {'impression_weight': 0, 'engagement_weight': 0,
               'click_weight': 0, 'conversion_weight': 0}
    for metric in request.json['optimize']:
        weights[metric[:-1] + '_weight'] = None
    data = pd.DataFrame(request.json['stats'])
    data = pro.preprocess(data, **weights)
    data = pro.filter_dates(data, cutoff=CUTOFF)
    [options, data] = pro.reindex_options(data)
    bandit = add_daily_results(data, num_options=len(options), memory=True,
                               shape=SHAPE, cutoff=CUTOFF, cut_level=CUT_LEVEL)
    shares = choose(bandit=bandit, accelerate=True)
    options = format_results(options, shares)
    return options.to_json(orient='records')
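# An illustrative payload for the route above. The exact column set is defined
# by this project's preprocess/filter_dates helpers, so the field names here
# are assumptions; 'channel' is optional and dates are expected as YYYY-MM-DD.
# Note that 'optimize' lists plural metric names, which the handler maps to
# weight keys via metric[:-1] + '_weight'.
example_payload = {
    "optimize": ["clicks", "conversions"],
    "stats": [
        {"date": "2024-05-01", "channel": "facebook", "ad_id": "ad_1",
         "impressions": 1000, "engagements": 50, "clicks": 10, "conversions": 1},
        {"date": "2024-05-01", "channel": "facebook", "ad_id": "ad_2",
         "impressions": 800, "engagements": 20, "clicks": 4, "conversions": 0},
    ],
}
# e.g. requests.post("http://localhost:5000/json", json=example_payload)  # URL is an assumption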
def __init__(self, frames, input_file_format=None, label_file_format=None,
             label_types=None, load_files=True, random_gen=False, augment=False,
             resize=False, tile_inputs=False, batch_size=1, seed=None):
    self.frames = frames
    self.samples = list(self.frames.keys())
    self.load_files = load_files
    self.random_gen = random_gen
    self.augment = augment
    self.resize = resize
    self.tile_inputs = tile_inputs
    self.input_file_format = input_file_format
    self.label_file_format = label_file_format
    self.input_files = []
    self.label_files = None if self.label_file_format is None else []
    self.label_types = label_types
    if not self.random_gen and self.input_file_format is not None:
        for s in self.frames:
            for n in self.frames[s]:
                self.input_files.append(_format(self.input_file_format, s, n))
                if self.label_file_format:
                    self.label_files.append(_format(self.label_file_format, s, n))
    else:
        print("[data_utils] either {input,label}_file_format or {input,label}_file_list must be present.")
        exit(1)
    self.inputs = self.input_files
    self.labels = self.label_files
    if self.load_files:
        # this loads everything into memory
        if self.random_gen:
            raise ValueError('Input sampling is only supported '
                             'if files are not preloaded.')
        self.inputs = [preprocess(file, resize=self.resize, tile=self.tile_inputs)
                       for file in self.input_files]
        if self.label_files is not None:
            self.labels = [preprocess(file, resize=self.resize, tile=self.tile_inputs)
                           for file in self.label_files]
        if self.tile_inputs:
            self.inputs = np.reshape(self.inputs, (-1,) + np.asarray(self.inputs).shape[-4:])
            if self.label_files is not None:
                self.labels = np.reshape(self.labels, (-1,) + np.asarray(self.labels).shape[-4:])
    elif self.tile_inputs:
        self.inputs = np.repeat(self.inputs, 8, axis=0)
        if self.label_files is not None:
            self.labels = np.repeat(self.labels, 8, axis=0)
    if self.augment:
        if self.tile_inputs:
            raise ValueError('Augmentation not supported if inputs are tiled.')
        self.image_transformer = ImageTransformer(rotation_range=90.,
                                                  shift_range=0.1,
                                                  shear_range=0.1,
                                                  zoom_range=0.1,
                                                  crop_size=constants.SHAPE,
                                                  fill_mode='nearest',
                                                  cval=0,
                                                  flip=True)
    super().__init__(max(len(self.samples), len(self.inputs)), batch_size, self.augment, seed)
def predict():
    input_data = request.get_json(force=True)
    transformed_input_data = preprocess(input_data)
    prediction = model.predict(transformed_input_data)
    transformed_prediction = postprocess(prediction)
    return jsonify({"prediction": transformed_prediction})
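# A minimal client sketch for the handler above. The route path, host and port
# are assumptions; the handler only requires a JSON body that this project's
# preprocess() can consume, so the feature names below are placeholders.
import requests

resp = requests.post("http://localhost:5000/predict",
                     json={"feature_1": 1.0, "feature_2": 2.0})
print(resp.json()["prediction"])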
# In[ ]:

from process import preprocess

# Get dataset with features

# In[ ]:

config = {
    'pred_var': 'Torvet PM10',   # Must include station and pollutant name (column name)
    'stations': ['Torvet'],      # Stations to use in feature extraction
    'window': 6,
}
data = preprocess(**config)

# In[ ]:

print('X train', data['X_train'].shape)
print('y train', data['y_train'].shape)
print('X validation', data['X_val'].shape)
print('X test', data['X_test'].shape)
# print(data['X_train'].columns)

# **Train Multi Output RF | GBM | MLP**
#
# _Params are hidden inside each file_

# In[ ]:
def csv():
    """
    Provide form to paste ads CSV with daily breakdown of channel (optional),
    ad_id, impressions, engagements, clicks and conversions; return options
    with suggested budget share or status for next period and provide direct
    upload to Facebook via API.
    """
    if request.method == 'POST':
        if request.form['update'] == 'true':  # pragma: no cover
            app_id = request.form['app_id']
            app_secret = request.form['app_secret']
            access_token = request.form['access_token']
            channels = ast.literal_eval(request.form['channels'])
            records = ast.literal_eval(request.form['records'])
            updatable = ['facebook', 'instagram']
            indices = []
            for channel in updatable:
                if channel in channels:
                    indices.append(channels.index(channel))
            results = pd.DataFrame(columns=['ad_id', 'ad_status'])
            for index in indices:
                for record in records[index]:
                    results.loc[len(results)] = [record['ad_id'], record['ad_status']]
            updated = update_facebook(app_id, app_secret, access_token, results)
            records = updated.to_dict('records')
            columns = updated.columns.values
            return render_template('update_result.html', records=records, columns=columns)
        weights = {}
        for weight in ['impression_weight', 'engagement_weight',
                       'click_weight', 'conversion_weight']:
            if request.form[weight] == '':
                weights[weight] = None
            else:
                weights[weight] = int(request.form[weight])
        data = pd.read_csv(StringIO(request.form['ads']), sep=None, engine='python')
        try:
            data = pro.preprocess(data, weights['impression_weight'],
                                  weights['engagement_weight'],
                                  weights['click_weight'],
                                  weights['conversion_weight'])
        except Exception as error:  # pragma: no cover
            print(error)
            message = 'Cannot pre-process your data. Please check the CSV input format and try again.'
            return render_template(
                'csv.html', error=message,
                output=request.form['output'],
                impression_weight=request.form['impression_weight'],
                engagement_weight=request.form['engagement_weight'],
                click_weight=request.form['click_weight'],
                conversion_weight=request.form['conversion_weight'],
                ads=request.form['ads'])
        try:
            data = pro.filter_dates(data, cutoff=CUTOFF)
        except Exception as error:  # pragma: no cover
            print(error)
            message = 'Please check your dates (format should be YYYY-MM-DD).'
            return render_template(
                'csv.html', error=message,
                output=request.form['output'],
                impression_weight=request.form['impression_weight'],
                engagement_weight=request.form['engagement_weight'],
                click_weight=request.form['click_weight'],
                conversion_weight=request.form['conversion_weight'],
                ads=request.form['ads'])
        if data.empty:  # pragma: no cover
            error = 'Please include results with data from the past ' + str(CUTOFF) + ' days.'
            return render_template(
                'csv.html', error=error,
                output=request.form['output'],
                impression_weight=request.form['impression_weight'],
                engagement_weight=request.form['engagement_weight'],
                click_weight=request.form['click_weight'],
                conversion_weight=request.form['conversion_weight'],
                ads=request.form['ads'])
        [options, data] = pro.reindex_options(data)
        bandit = add_daily_results(data, num_options=len(options), memory=True,
                                   shape=SHAPE, cutoff=CUTOFF, cut_level=CUT_LEVEL)
        shares = choose(bandit=bandit, accelerate=True)
        output = request.form['output']
        if output == 'status':
            results = format_results(options, shares, status=True)
        elif output == 'share':
            results = format_results(options, shares, status=False).round(2)
        if 'channel' in options.columns:
            channel_shares = format_results(options, shares, status=False) \
                .groupby('channel')['ad_share'].sum().round(2)
            channels = []
            records = []
            for name, group in results.groupby('channel'):
                channels.append(name)
                group = group.drop(['channel'], axis=1)
                columns = group.columns.values
                records.append(group.to_dict('records'))
            return render_template('csv_result_channels.html', channels=channels,
                                   channel_shares=channel_shares, records=records,
                                   columns=columns)
        records = results.to_dict('records')
        columns = results.columns.values
        return render_template('csv_result.html', records=records, columns=columns)
    return render_template('csv.html')
    'SmsSid': 'SM994801c6ee52cb08db6affa285661e12',
    'FromState': 'AZ',
    'SmsStatus': 'received',
    'FromCity': 'PHOENIX',
    'Body': 'go from Delhi to Gurgaon',
    'FromCountry': 'US',
    'To': '%2B14804284194',
    'ToZip': '85034',
    'NumSegments': '1',
    'MessageSid': 'SM994801c6ee52cb08db6affa285661e12',
    'AccountSid': 'ACffa2ba37390d2cc87d8b52bf6d869c2a',
    'From': '%2B16022886791',
    'ApiVersion': '2010-04-01'
}

test_string = response['Body']
message_body = preprocess(test_string)
query_type = find_query_type(message_body)
# print(query_type)
ans = ""
if query_type == 1:
    source, destination = find_src_dest(message_body)
    print(source, destination)
    ans = route1(source, destination)
else:
    # source, destination = find_src_dest(message_body)
    message_body = message_body.lower()
    near_me, query = near_locs(message_body)
    source = find_src(query)
    print('near me: ', near_me)
    print('source:', source)
from time import time
from process import preprocess
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

features_train_vect, features_train, features_test, labels_train, labels_test = preprocess()

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time() - t0, 3), "s"

import collections
t0 = time()
pred = clf.predict(features_test)
print "prediction time:", round(time() - t0, 3), "s"

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(pred, labels_test)
print accuracy

from bs4 import BeautifulSoup
from newspaper import Article

urls = ['http://www.newsmax.com/Politics/putin-tv-trump-dangerous/2017/04/17/id/784706/',
        'http://www.hollywoodreporter.com/heat-vision/star-wars-rare-archival-footage-shown-at-celebration-had-funny-new-hope-f-bomb-994552?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thr%2Ffilm+%28The+Hollywood+Reporter+-+Movies%29&utm_content=FeedBurner',
        'http://www.espn.com/sports/endurance/story/_/id/19177433/boston-marathon-2017-devin-wang-another-year-brings-closure-tragedy']
def lambda_handler(event, context):
    to_number = "+919079945319"
    from_number = "+13345131650"
    test_string = event['Body']
    message_body = preprocess(test_string)
    query_type = find_query_type(message_body)
    # print(query_type)
    ans = ""
    if query_type == 1:
        source, destination = find_src_dest(message_body)
        ans += "\nDirections \n"
        ans += "From :- " + source + "\n"
        ans += "To :- " + destination + "\n"
        ans += route1(source, destination)
    else:
        # source, destination = find_src_dest(message_body)
        message_body = message_body.lower()
        near_me, query = near_locs(message_body)
        source = find_src(query)
        ans += "\nFollowing results were fetched near your current location :- \n"
        ans += route2(source, near_me)
        # location_from, location_to = find_src_dest(message_body)
        # response_from = requests.get(
        #     "https://nominatim.openstreetmap.org/?addressdetails=1&q=" + location_from + "&format=json&limit=1")
        # response_to = requests.get(
        #     "https://nominatim.openstreetmap.org/?addressdetails=1&q=" + location_to + "&format=json&limit=1")
        # lat_from = response_from.json()[0]['lat']
        # lon_from = response_from.json()[0]['lon']
        # query_type = 0
        # ans = ""
        # near_me = "hospital"
        # if query_type == 0:
        #     lat_to = response_to.json()[0]['lat']
        #     lon_to = response_to.json()[0]['lon']
        #     response_route = requests.get(
        #         'http://www.mapquestapi.com/directions/v2/route?key=j1IVnoFZUzzkteLml8NKw1wjF5x5mGK3&from='
        #         + lat_from + ',' + lon_from + '&to=' + lat_to + ',' + lon_to)
        #     print("Start Point:", location_from, lat_from, lon_from)
        #     print("End Point:", location_to, lat_to, lon_to)
        #     direction = ["none", "north", "northwest", "northeast", "south", "southeast",
        #                  "southwest", "west", "east"]
        #     turnType = ["straight", "slight right", "right", "sharp right", "reverse",
        #                 "sharp left", "left", "slight left", "right u-turn", "left u-turn",
        #                 "right merge", "left merge", "right on ramp", "left on ramp",
        #                 "right off ramp", "left off ramp", "right fork", "left fork",
        #                 "straight fork"]
        #     for obb in response_route.json()["route"]["legs"][0]["maneuvers"]:
        #         s1 = "Take " + turnType[obb["turnType"]] + " and go " + str(int(obb["distance"] * 1609)) + \
        #              " meters, in " + direction[obb["direction"]] + " direction"
        #         s2 = obb["narrative"]
        #         ans += s1 + '\n' + s2 + '\n\n'
        #         # print(s1)
        #         # print(s2, end="\n\n")
        # elif query_type == 1:
        #     r = requests.get(
        #         'http://open.mapquestapi.com/nominatim/v1/search.php?key=j1IVnoFZUzzkteLml8NKw1wjF5x5mGK3&format=json&q='
        #         + lat_from + ',' + lon_from + '+[' + near_me + ']&addressdetails=1&limit=20')
        #     print(r.json())
        #     for obb in r.json():
        #         ans += obb["display_name"] + '\n'
        #         # print(i["display_name"])
        # ans = ans.strip()
        # print(ans)
    body = ans
    print(body)
    print(event)
    if not TWILIO_ACCOUNT_SID:
        return "Unable to access Twilio Account SID."
    elif not TWILIO_AUTH_TOKEN:
        return "Unable to access Twilio Auth Token."
    elif not to_number:
        return "The function needs a 'To' number in the format +12023351493"
    elif not from_number:
        return "The function needs a 'From' number in the format +19732644156"
    elif not body:
        return "The function needs a 'Body' message to send."

    # insert Twilio Account SID into the REST API URL
    populated_url = TWILIO_SMS_URL.format(TWILIO_ACCOUNT_SID)
    post_params = {"To": to_number, "From": from_number, "Body": body}

    # encode the parameters for Python's urllib
    data = parse.urlencode(post_params).encode()
    req = request.Request(populated_url)

    # add authentication header to request based on Account SID + Auth Token
    authentication = "{}:{}".format(TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
    base64string = base64.b64encode(authentication.encode('ascii'))
    req.add_header("Authorization", "Basic %s" % base64string.decode('ascii'))

    try:
        # perform HTTP POST request
        with request.urlopen(req, data) as f:
            print("Twilio returned {}".format(str(f.read().decode('utf-8'))))
    except Exception as e:
        print("something went wrong!")
        return e

    return ''
import json
import string
import collections
from collections import Counter
from nltk.corpus import stopwords
import vincent

# Stop Words
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['RT', 'via', 'de', 'o']

with open('data/stream_twitterarthursun.json', 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list with all the terms
        terms_all = [term for term in process.preprocess(tweet['text']) if term not in stop]
        # terms_hash = [term for term in process.preprocess(tweet['text'])
        #               if term.startswith('#')]
        # terms_single = set(terms_all)
        # terms_only = [term for term in process.preprocess(tweet['text'])
        #               if term not in stop and
        #               not term.startswith(('#', '@'))]
        # Update the counter
        count_all.update(terms_all)
        # count_all.update(terms_hash)
        # count_all.update(terms_only)
        # count_all.update(terms_single)
def read_class_name(path):
    df_thmia = pd.read_csv(path, header=None, sep="\t")
    name2idx = {name: i for i, name in enumerate(list(df_thmia.values.reshape(-1)))}
    idx2name = {idx: name for name, idx in name2idx.items()}
    return name2idx, idx2name


if __name__ == '__main__':
    from process import preprocess
    preprocess()
    name2idx, idx2name = read_class_name(config.arrythmia)
    x_test = pd.read_csv(r"../user_data/x_test.csv").iloc[:, 1:]
    x_test['age'].fillna(42.627019408001736, inplace=True)
    print(x_test)
    clf_list = load_model("model.ml")
    test = pd.read_csv(config.test_label, sep='\t', header=None, dtype=str)
    p = [[] for i in range(test.iloc[:, 0].size)]
    for i in tqdm(range(34)):
        print(str(i) + ":" + idx2name[i])
        clf = clf_list[i]
        p_test = clf.predict(x_test)
        for j in range(len(p_test)):
            if p_test[j] == 1:
def main():
    LOGGER_LEVEL = 10
    RAW_DATA_PATH = './data/raw/'
    RAW_CSV_NAME = 'raw_data.csv'
    t0 = time.time()
    logger = config.config_logger(__name__, LOGGER_LEVEL)
    pd.set_option('display.float_format', lambda x: '{0:.2f}'.format(x))

    logger.info('Beginning execution: zika dataset')
    logger.info('Logger configured - level {0}'.format(LOGGER_LEVEL))
    logger.info('Opening CSV: {0}{1}'.format(RAW_DATA_PATH, RAW_CSV_NAME))
    raw_data = pd.read_csv(RAW_DATA_PATH + RAW_CSV_NAME)

    logger.info('Raw dataset description:')
    process.basic_descriptives(raw_data)
    raw_data = process.preprocess(raw_data)
    # print(raw_data.describe().transpose().to_string())
    # print(raw_data.head().to_string())
    # print(raw_data.info().to_string())

    y_dengue = raw_data['dengue_pcr']
    y_zika = raw_data['zika_pcr']
    y_chik = raw_data['chik_pcr']
    diseases = [y_dengue, y_zika, y_chik]

    # Check process code for further explanation of select_disease function.
    # code: 1. Dengue, 2. Zika, 3. Chik, 4. Any
    # only_one: if True, input np.nan to patients with another disease.
    y = process.select_disease(diseases, code=1, only_one=False)
    logger.info('Target var frequency: \n{0}'.format(y.value_counts()))
    logger.info('Total obs: {0}'.format(y.value_counts().sum()))

    remove_list = ['id', 'centro_pob', 'name', 'dep', 'prov', 'dist',
                   'serotipo1', 'serotipo2', 'serotipo3', 'serotipo4',
                   'dengue_pcr', 'zika_pcr', 'chik_pcr']
    X = process.remove_vars(raw_data, remove_list)
    X = process.keep_non_nan(X, y)
    y = y.dropna()

    logger.info('Features dataset')
    process.basic_descriptives(X)

    logger.info('Split train test')
    X_train, X_test, y_train, y_test = models.split_data(X, y, proportion=0.4)

    logger.info('Estimating models')
    logger.info('GBM')
    grid_gbm = models.gbm_grid(X_train, y_train, n_cv=5)
    logger.info(grid_gbm.best_params_)
    logger.info('Train score: {0}'.format(grid_gbm.best_score_))
    logger.info('Test score: {0}'.format(grid_gbm.score(X_test, y_test)))

    logger.info('Logit')
    grid_logit = models.logit_grid(X_train, y_train, n_cv=5)
    logger.info(grid_logit.best_params_)
    logger.info('Train score: {0}'.format(grid_logit.best_score_))
    logger.info('Test score: {0}'.format(grid_logit.score(X_test, y_test)))

    logger.info('AdaBoost')
    grid_adaboost = models.adaboost_grid(X_train, y_train, n_cv=5)
    logger.info(grid_adaboost.best_params_)
    logger.info('Train score: {0}'.format(grid_adaboost.best_score_))
    logger.info('Test score: {0}'.format(grid_adaboost.score(X_test, y_test)))

    logger.info('Soft Voting')
    eclf = VotingClassifier(estimators=[('gbm', grid_gbm), ('logit', grid_logit),
                                        ('ada', grid_adaboost)], voting='soft')
    eclf.fit(X_train, y_train)
    y_pred = eclf.predict_proba(X_test)
    print(y_pred[:5, :])
    logger.info('Train score: {0}'.format(eclf.score(X_train, y_train)))
    logger.info('Test score: {0}'.format(eclf.score(X_test, y_test)))

    config.time_taken_display(t0)
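# A self-contained sketch of the soft-voting pattern used in main() above, run
# on synthetic data so it works without the zika dataset or the project's
# models/process modules. Estimator choices and parameters are illustrative.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.4, random_state=0)

# 'soft' voting averages predict_proba across the base estimators
vote = VotingClassifier(estimators=[('gbm', GradientBoostingClassifier()),
                                    ('logit', LogisticRegression(max_iter=1000)),
                                    ('ada', AdaBoostClassifier())],
                        voting='soft')
vote.fit(X_tr, y_tr)
print('Test accuracy:', vote.score(X_te, y_te))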
    python test.py decompress "compressed/longdress_vox10_1300" \
        --ckpt_dir="checkpoints/hyper/a6b3/"
    """
    args = parse_args()
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1.0
    config.gpu_options.allow_growth = True
    config.log_device_placement = True
    sess = tf.Session(config=config)
    model = importlib.import_module(args.modelname)

    if args.mode == "factorized":
        if args.command == "compress":
            cubes, cube_positions, points_numbers = preprocess(args.input, args.scale,
                                                               args.cube_size, args.min_num)
            strings, min_v, max_v, shape = compress_factorized(cubes, model, args.ckpt_dir)
            if not args.output:
                args.output = os.path.split(args.input)[-1][:-4]
                rootdir = './compressed'
            else:
                rootdir, args.output = os.path.split(args.output)
            bytes_strings, bytes_pointnums, bytes_cubepos = write_binary_files_factorized(
                args.output, strings.numpy(), points_numbers, cube_positions,
                min_v.numpy(), max_v.numpy(), shape.numpy(), rootdir=rootdir)
        elif args.command == "decompress":
            rootdir, filename = os.path.split(args.input)
            if not args.output:
                args.output = filename + "_rec.ply"
            strings_d, points_numbers_d, cube_positions_d, min_v_d, max_v_d, shape_d = \
                read_binary_files_factorized(filename, rootdir)
            cubes_d = decompress_factorized(strings_d, min_v_d, max_v_d, shape_d,
                                            model, args.ckpt_dir)
import nltk

# nltk downloads
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# loading the dataset
print("Loading the dataset .......")
reviews_df = pd.read_csv("../dataset/AllProductReviews.csv")

# removing neutral reviews
reviews_df = reviews_df[reviews_df["ReviewStar"] != 3]

# preprocessing the reviews
reviews_df = process.preprocess(reviews_df)

# word stemming with part-of-speech tagging
reviews_df = process.word_stemming(reviews_df)

# splitting the dataset into training and testing sets
train_df, test_df = train_test_split(reviews_df, test_size=0.4, shuffle=True)

# extracting the features from the text data
aspects, values = feature_processing.feature_extraction(train_df)

# creating the feature vectors
feature_vectors, y = feature_processing.create_feature_vector(train_df, aspects, values)
feature_vectors_test, y_test = feature_processing.create_feature_vector(test_df, aspects, values)
""" import sys import process import random_forest import validator import visualizations import save import pandas as pd from openpyxl import load_workbook # Create data structures that will be used in the analysis df, df1, header, canc, targ, data, classes, orig_data, orig_classes, excl_targ, freq = process.preprocess( datapath='./../data/median/', fil=sys.argv[1], targ=sys.argv[2], exclude=sys.argv[3]) # Random Forest Classifier, prediction, and hold out accuracy rfc, rfc_pred, mean_acc = random_forest.random_forest(canc, targ, data, classes, orig_data, orig_classes) # Model performance and statistical measures. THIS FUNCTION IS ALSO NECESSARY TO GENERATE THE FIGURES. cm, pvalue, zscore, cv_score, summary = validator.summary_statistics( rfc, rfc_pred, data, classes, orig_classes, orig_data, targ, excl_targ, mean_acc, canc) # Model comparison with Auslander et al., 2016. Use only gene expression in these predictions for a true comparison. #df2 = df1.copy(deep=True)
def pushSummarys(self, tweet):
    if ('delete' not in tweet) and (tweet['lang'] == 'en'):
        if 'retweeted_status' in tweet:
            tem = tweet['retweeted_status']
            tem['timestamp_ms'] = tweet['timestamp_ms']
            tem['created_at'] = tweet['created_at']
            tweet = tem
        # t1 = T.time()
        delta = self.time.calculatetime(tweet['created_at'])
        if delta >= 1:
            for x in range(self.L):
                self.numofdayA[x] = 0
                self.numofdayB[x] = 0
                stemwords_interest_profile = self.interest_files[x]
                listofsummaryA = [summary[0] for summary in self.summaryA[x] if summary[1] == self.day]
                if len(listofsummaryA) > 0:
                    self.tfidfthresholdA[x] = min(summaryA[2] for summaryA in listofsummaryA)
                    # self.jsdthresholdA[x] = min(summaryA[3] for summaryA in listofsummaryA)
                del listofsummaryA
                listofsummaryB = [summary[0] for summary in self.summaryB[x] if summary[1] == self.day]
                if len(listofsummaryB) > 0:
                    self.tfidfthresholdB[x] = min(summaryB[2] for summaryB in listofsummaryB)
                    # self.jsdthresholdB[x] = min(summaryB[3] for summaryB in listofsummaryB)
                    sumoflen = sum(summaryBBBB[5] for summaryBBBB in listofsummaryB)
                    ADL = sumoflen / len(listofsummaryB)
                    lenofq = len(stemwords_interest_profile)
                    result = []
                    for summaryBBB in listofsummaryB:
                        score = 0
                        TF = summaryBBB[4]
                        for q in stemwords_interest_profile:
                            tf = TF[q]
                            avgtf = sum(TF[qq] for qq in stemwords_interest_profile) / len(TF)
                            RITF = math.log2(1 + tf) / math.log2(1 + avgtf)
                            LRTF = tf * math.log2(1 + ADL / summaryBBB[5]) + 0.0001
                            w = 2 / (1 + math.log2(1 + lenofq))
                            TFF = w * RITF / (1 + RITF) + (1 - w) * LRTF / (1 + LRTF)
                            IDF = math.log((len(listofsummaryB) + 1) / (self.qoccur[x][q] + 1)) + 0.0001
                            AEF = self.numofq[x][q] / (self.qoccur[x][q] + 1)
                            TDF = IDF * AEF / (1 + AEF)
                            sim = TFF * TDF
                            score += sim
                            del tf, avgtf, RITF, LRTF, w, TFF, IDF, AEF, TDF, sim
                        # score += summaryBBB[3]
                        result.append([score, summaryBBB[1]])
                    del listofsummaryB
                    result.sort(key=operator.itemgetter(0), reverse=True)
                    j = 1
                    day = str(self.day + 1)
                    # d = '201507' + day
                    for i in result:
                        if (self.day) >= 9:
                            d = '201608' + day
                        else:
                            d = '2016080' + day
                        with open('B.txt', 'a') as ff:
                            ff.write('%s %s Q0 %s %s %s CCNUNLPrun1\n'
                                     % (d, self.topicid[x], i[1], str(j), i[0]))
                        j = j + 1
            self.time.settime()
            self.day = self.day + 1
        content = tweet['text']
        stemwords_tweet = preprocess(content)
        del content
        wordInTweet = {}
        if stemwords_tweet == False:
            pass
        else:
            numOfWordAtweet = len(stemwords_tweet)
            self.SumOfLenthOfStream = numOfWordAtweet + self.SumOfLenthOfStream
            id_str = tweet['id_str']
            for word in stemwords_tweet:
                if word in self.wordInStream:
                    self.wordInStream[word] += 1
                else:
                    self.wordInStream[word] = 1
                if word in wordInTweet:
                    wordInTweet[word] += 1
                else:
                    wordInTweet[word] = 1
            for x in range(self.L):
                stemwords_interest_profile = self.interest_files[x]
                count = sum(stemwords_tweet.count(wordsss) for wordsss in stemwords_interest_profile)
                # print(count)
                if count >= 2:
                    sumoftfidf = 0.0
                    del count
                    for word in stemwords_tweet:
                        if word in self.queries_word[x]:
                            self.queries_word[x][word] += 1
                        else:
                            self.queries_word[x][word] = 1
                    for word in set(stemwords_tweet):
                        if word not in self.queries_occur[x]:
                            self.queries_occur[x][word] = 1
                        else:
                            self.queries_occur[x][word] += 1
                    self.queries_numOfWord[x] += numOfWordAtweet
                    self.queries_numOfTweet[x] += 1
                    for word in stemwords_tweet:
                        tf = self.queries_word[x][word] / self.queries_numOfWord[x]
                        idf = math.log2((self.queries_numOfTweet[x] + 1) / self.queries_occur[x][word])
                        sumoftfidf = sumoftfidf + tf * idf
                    if sumoftfidf >= self.tfidfthresholdA[x] and self.numofdayA[x] < 10:
                        listofsummaryA = [summary[0] for summary in self.summaryA[x]]
                        if len(listofsummaryA) > 0:
                            jsd = []
                            for summary in listofsummaryA:
                                sumofjsd = 0
                                tf = {}
                                for wordss in summary[0]:
                                    if wordss in tf:
                                        tf[wordss] += 1
                                    else:
                                        tf[wordss] = 1
                                sameword = [word for word in stemwords_tweet if word in summary[0]]
                                if len(sameword) > 0:
                                    for word in sameword:
                                        Pti = float(wordInTweet[word]) / float(numOfWordAtweet)
                                        Psi = float(self.wordInStream[word]) / float(self.SumOfLenthOfStream)
                                        thetaTi = self.lemda * Pti + (1 - self.lemda) * Psi
                                        Ptj = float(tf[word]) / float(len(summary[0]))
                                        Psj = float(self.wordInStream[word]) / float(self.SumOfLenthOfStream)
                                        thetaTj = self.lemda * Ptj + (1 - self.lemda) * Psj
                                        # sumofjsd += thetaTi * math.log(thetaTj/thetaTj)
                                        M = (thetaTi + thetaTj) / 2
                                        sumofjsd += (0.5 * (thetaTi * math.log(thetaTi / M)) +
                                                     0.5 * (thetaTj * math.log(thetaTj / M)))
                                    jsd.append(sumofjsd)
                                else:
                                    jsd.append(0.07)
                            JSD = min(jsd)
                        else:
                            JSD = 0.05
                        # print(JSD)
                        if JSD >= self.jsdthresholdA[x]:
                            # print(self.topicid[x] + str(type(self.topicid[x])))
                            # print(id_str + str(type(id_str)))
                            # self.rest.Post(self.topicid[x], id_str)
                            self.jsdthresholdA[x] = JSD
                            self.numofdayA[x] += 1
                            a = [stemwords_tweet, id_str, sumoftfidf, JSD]
                            self.summaryA[x].append([a, self.day])
                            self.fa.write('%s %s tfidf:%s jsd:%s\n'
                                          % (self.day, self.topicid[x], sumoftfidf, JSD))
                    if sumoftfidf >= self.tfidfthresholdB[x] and self.numofdayB[x] < 100:
                        listofsummaryB = [summary[0] for summary in self.summaryB[x]]
                        if len(listofsummaryB) > 0:
                            jsd = []
                            for summary in listofsummaryB:
                                sumofjsd = 0
                                sameword = [word for word in stemwords_tweet if word in summary[0]]
                                tf = {}
                                for wordss in summary[0]:
                                    if wordss in tf:
                                        tf[wordss] += 1
                                    else:
                                        tf[wordss] = 1
                                if len(sameword) > 0:
                                    for word in sameword:
                                        Pti = float(wordInTweet[word]) / float(numOfWordAtweet)
                                        Psi = float(self.wordInStream[word]) / float(self.SumOfLenthOfStream)
                                        thetaTi = self.lemda * Pti + (1 - self.lemda) * Psi
                                        Ptj = float(tf[word]) / float(len(summary[0]))
                                        Psj = float(self.wordInStream[word]) / float(self.SumOfLenthOfStream)
                                        thetaTj = self.lemda * Ptj + (1 - self.lemda) * Psj
                                        # sumofjsd += thetaTi * math.log(thetaTi/thetaTj)
                                        M = float(thetaTi + thetaTj) / 2
                                        sumofjsd += (0.5 * (thetaTi * math.log(thetaTi / M)) +
                                                     0.5 * (thetaTj * math.log(thetaTj / M)))
                                    jsd.append(sumofjsd)
                                else:
                                    jsd.append(0.07)
                            JSD = min(jsd)
                        else:
                            JSD = 0.05
                        # print(JSD)
                        if JSD >= self.jsdthresholdB[x]:
                            self.numofdayB[x] += 1
                            lenoflistB = len(listofsummaryB)
                            self.jsdthresholdB[x] = (lenoflistB * self.jsdthresholdB[x] + JSD) / (lenoflistB + 1)
                            TF = {}
                            for q in stemwords_interest_profile:
                                TF[q] = stemwords_tweet.count(q)
                                if q in stemwords_tweet:
                                    if q in self.qoccur[x]:
                                        self.qoccur[x][q] += 1
                                    else:
                                        self.qoccur[x][q] = 1
                                else:
                                    self.qoccur[x][q] = 0
                                if q in self.numofq[x]:
                                    self.numofq[x][q] += stemwords_tweet.count(q)
                                else:
                                    self.numofq[x][q] = stemwords_tweet.count(q)
                            b = [stemwords_tweet, id_str, sumoftfidf, JSD, TF, numOfWordAtweet]
                            self.summaryB[x].append([b, self.day])
    pass