def classify_lightcurves():
    """Classify every .dat light curve in the R- and G-filter directories with the
    random forest microlensing classifier, and save each light curve into the
    corresponding 'classified' directory with its microlensing probability
    prepended to the file name."""
    dataFileNamesR = []
    dataFileNamesG = []
    fileDirectoryR = lightcurve_path + 'Rfilter/'
    fileDirectoryG = lightcurve_path + 'Gfilter/'
    fileDirectoryClassifiedR = lightcurve_path + 'classified/Rfilter/'
    fileDirectoryClassifiedG = lightcurve_path + 'classified/Gfilter/'

    # Collect the light-curve files for each filter
    for f in os.listdir(fileDirectoryR):
        if f.endswith('.dat'):
            dataFileNamesR.append(f)
    for f in os.listdir(fileDirectoryG):
        if f.endswith('.dat'):
            dataFileNamesG.append(f)
    dataFileNamesR = natsorted(dataFileNamesR)
    dataFileNamesG = natsorted(dataFileNamesG)

    # Build the random forest and PCA models from the pre-computed feature files
    rf, pca = models.create_models(
        features_path + 'all_features.txt', features_path + 'pca_features.txt')

    class_results = []

    for filename in dataFileNamesR:
        data = np.loadtxt(fileDirectoryR + filename, usecols=np.arange(0, 3))
        mjd = [float(i) for i in data[:, 0]]
        mag = [float(i) for i in data[:, 1]]
        magerr = [float(i) for i in data[:, 2]]

        # Sort the measurements in time
        sosort = np.array([mjd, mag, magerr]).T
        sosort = sosort[sosort[:, 0].argsort(), ]
        mjd = sosort[:, 0]
        mag = sosort[:, 1]
        # Small offset avoids zero-valued magnitude errors in the classifier
        magerr = sosort[:, 2] + 0.0001

        prediction, ml_pred = microlensing_classifier.predict(
            mag, magerr, rf, pca)[0:2]
        #print('filename: ', filename, 'prediction: ', prediction, 'ml_pred = ', ml_pred)
        class_results.append([filename, prediction, ml_pred])

        # Prepend the microlensing probability to the output file name
        ml_pred = str(ml_pred).replace("[", "").replace("]", "")
        location = fileDirectoryClassifiedR + ml_pred + str(filename)
        np.savetxt(location, data, fmt='%s')

    for filename in dataFileNamesG:
        data = np.loadtxt(fileDirectoryG + filename, usecols=np.arange(0, 3))
        mjd = [float(i) for i in data[:, 0]]
        mag = [float(i) for i in data[:, 1]]
        magerr = [float(i) for i in data[:, 2]]

        # Sort the measurements in time
        sosort = np.array([mjd, mag, magerr]).T
        sosort = sosort[sosort[:, 0].argsort(), ]
        mjd = sosort[:, 0]
        mag = sosort[:, 1]
        magerr = sosort[:, 2]

        try:
            prediction, ml_pred = microlensing_classifier.predict(
                mag, magerr, rf, pca)[0:2]
        except ValueError:
            prediction, ml_pred = "NA", "NA"
        #print('filename: ', filename, 'prediction: ', prediction, 'ml_pred = ', ml_pred)
        class_results.append([filename, prediction, ml_pred])

        # Prepend the microlensing probability to the output file name
        ml_pred = str(ml_pred).replace("[", "").replace("]", "")
        location = fileDirectoryClassifiedG + ml_pred + str(filename)
        np.savetxt(location, data, fmt='%s')

        targetname = str(filename).replace("R.dat", "").replace("G.dat", "")
        print(targetname)
        #target = Target.objects.get(name=targetname)
        #target.save(extras={'Microlensing probability': {'probability': ml_pred, 'timestamp': datetime.datetime.now()}})

    return class_results
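# Minimal, hypothetical driver for classify_lightcurves(). The function relies on
# module-level globals (lightcurve_path, features_path) and on os, np (numpy),
# natsorted, models and microlensing_classifier being imported in the defining
# module; the paths below are illustrative placeholders, not part of the original code.
if __name__ == '__main__':
    # Root folder containing Rfilter/, Gfilter/ and classified/{Rfilter,Gfilter}/
    lightcurve_path = '/path/to/lightcurves/'
    # Folder containing the pre-computed all_features.txt and pca_features.txt files
    features_path = '/path/to/features/'

    results = classify_lightcurves()
    for fname, prediction, ml_prob in results:
        print(fname, prediction, ml_prob)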
def mulens(
        fid, magpsf, sigmapsf, magnr, sigmagnr,
        magzpsci, isdiffpos, ndethist):
    """ Returns the predicted class (among microlensing, variable star,
    cataclysmic event, and constant event) & probability of an alert to be
    a microlensing event in each band using a Random Forest Classifier.

    Parameters
    ----------
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    magnr, sigmagnr: Spark DataFrame Columns
        Magnitude of nearest source in reference image PSF-catalog
        within 30 arcsec and 1-sigma error
    magzpsci: Spark DataFrame Column
        Magnitude zero point for photometry estimates
    isdiffpos: Spark DataFrame Column
        t => candidate is from positive (sci minus ref) subtraction
        f => candidate is from negative (ref minus sci) subtraction
    ndethist: Spark DataFrame Column
        Number of previous detections of the object (int)

    Returns
    -------
    out: pandas.Series of float
        Mean of the per-band probabilities if the event was classified as
        microlensing in both bands, otherwise 0.0.

    Examples
    --------
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    >>> df = spark.read.load(ztf_alert_sample)

    # Required alert columns
    >>> what = [
    ...     'fid', 'magpsf', 'sigmapsf',
    ...     'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    >>> args = [F.col(i) for i in what_prefix]
    >>> args += ['candidate.ndethist']
    >>> df = df.withColumn('new_mulens', mulens(*args))

    # Drop temp columns
    >>> df = df.drop(*what_prefix)

    >>> df.filter(df['new_mulens'] > 0.0).count()
    0
    """
    warnings.filterwarnings('ignore')

    # Load the pre-trained random forest and PCA models
    curdir = os.path.dirname(os.path.abspath(__file__))
    model_path = curdir + '/data/models/'
    rf, pca = load_external_model(model_path)

    valid_index = np.arange(len(magpsf), dtype=int)

    # At most 100 detections in total
    mask = (ndethist.astype(int) < 100)

    # At least 20 valid (non-NaN) measurements in total, i.e. roughly 10 per band
    mask *= magpsf.apply(lambda x: np.sum(np.array(x) == np.array(x))) >= 20

    to_return = np.zeros(len(magpsf), dtype=float)
    for index in valid_index[mask.values]:
        # Select only valid measurements (not upper limits)
        maskNotNone = np.array(magpsf.values[index]) == np.array(magpsf.values[index])

        classes = []
        probs = []
        for filt in [1, 2]:
            maskFilter = np.array(fid.values[index]) == filt
            m = maskNotNone * maskFilter

            # Reject if less than 10 measurements
            if np.sum(m) < 10:
                classes.append('')
                continue

            # Compute DC mag
            mag, err = np.array([
                dc_mag(i[0], i[1], i[2], i[3], i[4], i[5], i[6])
                for i in zip(
                    np.array(fid.values[index])[m],
                    np.array(magpsf.values[index])[m],
                    np.array(sigmapsf.values[index])[m],
                    np.array(magnr.values[index])[m],
                    np.array(sigmagnr.values[index])[m],
                    np.array(magzpsci.values[index])[m],
                    np.array(isdiffpos.values[index])[m])
            ]).T

            # Run the classifier
            output = microlensing_classifier.predict(mag, err, rf, pca)

            # Update the results
            # Beware, in the branch FINK the order has changed
            # classification, p_cons, p_CV, p_ML, p_var = microlensing_classifier.predict()
            classes.append(str(output[0]))
            probs.append(float(output[3][0]))

        # Store the mean probability if ML is favoured in both bands, otherwise 0
        if np.all(np.array(classes) == 'ML'):
            to_return[index] = np.mean(probs)
        else:
            to_return[index] = 0.0

    return pd.Series(to_return)
def mulens(fid, magpsf, sigmapsf, magnr, sigmagnr,
           magzpsci, isdiffpos, rf, pca):
    """ Returns the predicted class (among microlensing, variable star,
    cataclysmic event, and constant event) & probability of an alert to be
    a microlensing event in each band using a Random Forest Classifier.

    Parameters
    ----------
    fid: Spark DataFrame Column
        Filter IDs (int)
    magpsf, sigmapsf: Spark DataFrame Columns
        Magnitude from PSF-fit photometry, and 1-sigma error
    magnr, sigmagnr: Spark DataFrame Columns
        Magnitude of nearest source in reference image PSF-catalog
        within 30 arcsec and 1-sigma error
    magzpsci: Spark DataFrame Column
        Magnitude zero point for photometry estimates
    isdiffpos: Spark DataFrame Column
        t => candidate is from positive (sci minus ref) subtraction
        f => candidate is from negative (ref minus sci) subtraction
    rf: RandomForestClassifier
        sklearn.ensemble._forest.RandomForestClassifier
    pca: PCA
        sklearn.decomposition._pca.PCA

    Returns
    -------
    out: list
        Returns the class (string) and microlensing score (double) ordered as
        [class_band_1, ml_score_band1, class_band_2, ml_score_band2]

    Examples
    --------
    >>> from fink_science.utilities import concat_col
    >>> from pyspark.sql import functions as F

    # wrapper to pass broadcasted values
    >>> def mulens_wrapper(fid, magpsf, sigmapsf, magnr, sigmagnr, magzpsci, isdiffpos):
    ...     return mulens(fid, magpsf, sigmapsf, magnr, sigmagnr, magzpsci, isdiffpos, rfbcast.value, pcabcast.value)

    >>> df = spark.read.load(ztf_alert_sample)

    >>> schema = load_mulens_schema_twobands()

    # Required alert columns
    >>> what = [
    ...     'fid', 'magpsf', 'sigmapsf',
    ...     'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos']

    # Use for creating temp name
    >>> prefix = 'c'
    >>> what_prefix = [prefix + i for i in what]

    # Append temp columns with historical + current measurements
    >>> for colname in what:
    ...     df = concat_col(df, colname, prefix=prefix)

    >>> curdir = os.path.dirname(os.path.abspath(__file__))
    >>> model_path = curdir + '/../data/models/'
    >>> rf, pca = load_external_model(model_path)
    >>> rfbcast = spark.sparkContext.broadcast(rf)
    >>> pcabcast = spark.sparkContext.broadcast(pca)

    >>> t = udf(mulens_wrapper, schema)
    >>> args = [col(i) for i in what_prefix]
    >>> df_mulens = df.withColumn('mulens', t(*args))

    # Drop temp columns
    >>> df_mulens = df_mulens.drop(*what_prefix)

    >>> df_mulens.agg({"mulens.ml_score_1": "min"}).collect()[0][0]
    0.0

    >>> df_mulens.agg({"mulens.ml_score_1": "max"}).collect()[0][0] < 1.0
    True
    """
    warnings.filterwarnings('ignore')

    # Select only valid measurements (not upper limits)
    maskNotNone = np.array(magpsf) != None

    out = []
    for filt in [1, 2]:
        maskFilter = np.array(fid) == filt
        m = maskNotNone * maskFilter

        # Reject if less than 10 measurements
        if np.sum(m) < 10:
            out.extend(['', 0.0])
            continue

        # Compute DC mag
        mag, err = np.array([
            dc_mag(i[0], i[1], i[2], i[3], i[4], i[5], i[6])
            for i in zip(
                np.array(fid)[m],
                np.array(magpsf)[m],
                np.array(sigmapsf)[m],
                np.array(magnr)[m],
                np.array(sigmagnr)[m],
                np.array(magzpsci)[m],
                np.array(isdiffpos)[m])
        ]).T

        # Run the classifier
        output = microlensing_classifier.predict(mag, err, rf, pca)

        # Update the results
        out.extend([str(output[0]), float(output[1][0])])

    return out
def test_probability_prediction(value):
    pred = microlensing_classifier.predict(mag, magerr, rf, pca)[1]
    value.assertTrue(
        pred >= 0.4 and pred <= 0.6,
        "Classifier failed, probability prediction not within range.")
def test_predict(value):
    value.assertEqual(
        microlensing_classifier.predict(mag, magerr, rf, pca)[0],
        'ML',
        "Classifier failed, predicted class is not correct.")
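# The two tests above follow the unittest convention (their first argument plays the
# role of `self`) and rely on module-level globals mag, magerr, rf and pca that are
# not shown here. Below is a hypothetical fixture sketch: the import style and the
# file names (known_ml_event.dat, the feature .txt files) are assumptions, while the
# create_models/predict calls mirror those used in classify_lightcurves().
import unittest

import numpy as np
from LIA import models, microlensing_classifier  # assumed import style

# Pre-trained random forest + PCA built from the training feature files (placeholder names)
rf, pca = models.create_models('all_features.txt', 'pca_features.txt')

# Light curve of a known microlensing event, columns mjd, mag, magerr (placeholder file)
_data = np.loadtxt('known_ml_event.dat', usecols=(0, 1, 2))
mag, magerr = _data[:, 1], _data[:, 2]


class TestMicrolensingClassifier(unittest.TestCase):
    # Bind the module-level test functions above as methods of this TestCase
    test_predict = test_predict
    test_probability_prediction = test_probability_prediction


if __name__ == '__main__':
    unittest.main()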
# These light curves should include some ML events, including OB190011
for lc in [207194, 132119, 177748, 177461, 78283, 121424, 174610, 215315]:
# To scan random light curves instead, uncomment the two lines below and
# comment out the loop above:
#randi = np.arange(30000, 200000, 1)
#for lc in randi:
    lc = int(lc)
    print(lc)
    try:
        # Extract time, magnitude, error, background and pixel-scale columns
        time = hdf_files['dataset_photometry'][lc][:, 9]
        mag = hdf_files['dataset_photometry'][lc][:, 11]
        emag = hdf_files['dataset_photometry'][lc][:, 12]
        back = hdf_files['dataset_photometry'][lc][:, -2]
        ppscale = hdf_files['dataset_photometry'][lc][:, -4]

        # Keep only well-behaved measurements
        mask = (time > 1) & (mag > 1) & \
            (np.abs(ppscale - exptime[ind_ref] / exptime) < 0.2) & (np.abs(back) < 250)
        order = time[mask].argsort()

        if np.median(mag[mask]) < 30:
            classification = microlensing_classifier.predict(
                time[mask][order], mag[mask][order], emag[mask][order], model)
            # Inspect light curves with a high microlensing probability
            if float(classification[3][1]) > 0.6:
                plt.scatter(time[mask], mag[mask])
                plt.gca().invert_yaxis()
                plt.show()
                import pdb; pdb.set_trace()
    except Exception:
        # Skip light curves with missing or malformed photometry
        pass

import pdb; pdb.set_trace()