def clustering(lats, longs, timestamps, ID, timestmp, multiPDF=False):
    """
    Clusters the GPS coordinates using DBSCAN and plots the result.

    :param lats: The latitudes (in degrees)
    :param longs: The longitudes (in degrees)
    :param timestamps: The timestamps of the GPS coordinates
    :param ID: The participant ID (used for the output PDF)
    :param timestmp: The timestamp shown in the plot title
    :param multiPDF: Kept for interface compatibility (unused here)
    :return: (True, distance) when DBSCAN found clusters, (False, 0) otherwise
    """
    from math import radians  # local import: needed for the degree fix below

    folder = "out/"
    plotDir = folder + "plots/Walking Test Analysis"
    R = 6371  # Radius of the earth in km

    # BUGFIX: cos()/sin() expect radians, but GPS coordinates come in degrees.
    # Convert before projecting onto cartesian coordinates, otherwise the
    # distances DBSCAN clusters on are distorted.
    cartesianX = []
    cartesianY = []
    cartesianZ = []
    for lat, long in zip(lats, longs):
        latRad = radians(lat)
        longRad = radians(long)
        cartesianX.append(R * cos(latRad) * cos(longRad))
        cartesianY.append(R * cos(latRad) * sin(longRad))
        cartesianZ.append(R * sin(latRad))

    combined = np.vstack((cartesianX, cartesianY, cartesianZ)).T
    (core_samples, labels) = dbscan(combined, eps=0.5)

    # BUGFIX: dbscan() returns core_samples as *indices* into the input and
    # labels as one label per input point.  zip(labels, core_samples) paired
    # them positionally, mismatching labels with points; pair each core point
    # index with its own label instead.
    grouped = [(labels[index], index) for index in core_samples]

    nonGroupedPositions = []
    for (label, index) in grouped:
        if label != -1:  # -1 marks noise/outliers
            nonGroupedPositions.append((lats[index], longs[index], timestamps[index]))

    if len(nonGroupedPositions) > 0:
        # Unpack once instead of re-zipping three times
        (y, x, t) = zip(*nonGroupedPositions)  # latitudes, longitudes, timestamps
        x2, y2, newx2, newy2 = smooth(y, x, t)
        plt.plot(y2, x2, label="Linear Interpolation")
        plt.plot(newy2, newx2, label="Savgol Filter", color="r")
        distance = calcDistanceWalked(newy2, newx2)

        grouped = sorted(grouped, key=itemgetter(0))
        clusters = {}
        labels = []
        for key, group in groupby(grouped, key=itemgetter(0)):
            # group the clusters based on their label
            labels.append(key)
            clusters[key] = [el[1] for el in group]

        noise = False
        colors = plt.get_cmap("Spectral")(np.linspace(0, 1, len(clusters)))
        for label in labels:
            indices = clusters[label]
            latitudes = []
            longitudes = []
            size = 10
            alpha = 0.5
            lineWidth = 0.15
            for i in indices:
                latitudes.append(lats[i])
                longitudes.append(longs[i])
            if label == -1:
                # outliers are identified with a label of -1
                plt.plot(latitudes, longitudes, "o", markerfacecolor=almost_black,
                         markeredgecolor=almost_black, markersize=size, alpha=alpha,
                         linewidth=lineWidth, label="Outlier")
                noise = True
            else:
                plt.plot(latitudes, longitudes, "o", markerfacecolor=colors[label],
                         markeredgecolor=almost_black, markersize=size, alpha=alpha,
                         linewidth=lineWidth, label="Cluster %i" % (label + 1))

        plt.title("Timestamp: %s\n Number of clusters: %i\n Calculated distance: %i meters" % (
            timestmp, (len(clusters) - 1) if noise else len(clusters), round(distance)))
        plt.xlabel("Latitude")
        plt.ylabel("Longitude")
        fancyPlot()
        writeToPdf(ID, plotDir)
        return True, distance
    else:
        # DBSCAN gave back an empty array, therefore we cannot perform any
        # smoothing or distance calculation
        return False, 0
def sleepAnalysis(): """ Analysis of the sleep duration of our participants per day """ csvDir = "out/csv/Sleep Analysis" plotDir = "out/plots/Sleep Analysis" measurementDictionary = OrderedDict() try: os.makedirs(csvDir) except OSError: # path already exists pass for (ID, pid, device) in basisPeak + fitBit: measurements = None if device == "basispeak": measurements = db.measurements.find({"pid": pid, "mtype": 7, "date": {"$gte": start, "$lt": end}}).sort( [("date", 1)]) measurements = list(measurements) elif device == "fitbit": measurements = db.diaFitBit.find({"pid": pid, "mtype": 7, "date": {"$gte": start, "$lt": end}}).sort( [("date", 1)]) measurements = list(measurements) elif device == "microsoftband": measurements = [] keys = [] if device == "basispeak": for key, group in groupby(measurements, lambda x: x["end"].strftime("%y-%m-%d")): # group the measurements based on date keys.append(key) measurementDictionary[key] = [el for el in group] elif device == "fitbit": for measurement in measurements: measurement['end'] = measurement['date'] measurement['value'] /= 60.0 date = measurement['date'].strftime("%y-%m-%d") keys.append(date) measurementDictionary[date] = [measurement] ratings = questionAnalysis(ID) with open(os.path.join(csvDir, ID + ".csv"), "w") as csvFile: writer = csv.writer(csvFile, delimiter=",") writer.writerow(("Date", "Rating", "Duration")) intersectedDates = np.intersect1d([el[1] for el in ratings], keys) sleepDurationPerDate = OrderedDict() for date in np.union1d([el[1] for el in ratings], keys): filteredRatings = [el for el in ratings if el[1] == date] if date in keys: for measurement in measurementDictionary[date]: duration = round(measurement['value'], 2) if filteredRatings and duration != 0: # We found a rating for this date rating = filteredRatings[0][0] try: (rating, sleepDuration, dateObject) = sleepDurationPerDate[date] sleepDurationPerDate[date] = (rating, sleepDuration + duration, dateObject) except KeyError: 
sleepDurationPerDate[date] = (rating, duration, measurement['date']) writer.writerow(( measurement['end'].strftime("%y-%m-%d %H:%M:%S"), rating, duration )) else: # No rating found, write "-" as rating writer.writerow(( measurement['end'].strftime("%y-%m-%d %H:%M:%S"), "-", duration )) else: # This date had no measurement if filteredRatings: rating = ratings[0][0] writer.writerow(( date, rating, "-" )) else: writer.writerow(( date, "-", "-" )) alpha = 0.5 lineWidth = 0.15 s = 100 X = [] # contains the measurements Y = [] # contains the ratings for date in sleepDurationPerDate: (rating, totalSleepDuration, dateObject) = sleepDurationPerDate[date] X.append(totalSleepDuration) Y.append(rating) if rating == "Goed": plt.scatter(dateObject, totalSleepDuration, s=s, color="g", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Good day") elif rating == "Gemiddeld": plt.scatter(dateObject, totalSleepDuration, s=s, color="orange", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Average day") elif rating == "Slecht": plt.scatter(dateObject, totalSleepDuration, s=s, color="r", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Bad day") plt.gca().xaxis.set_major_formatter(DateFormatter('%y-%m-%d')) plt.gca().xaxis.set_major_locator(WeekdayLocator()) plt.setp(plt.gca().xaxis.get_majorticklabels(), rotation=40) plt.xlabel("Date") plt.ylabel("Sleep duration (hours)") fancyPlot(dateLimit=True) writeToPdf(ID, plotDir) (correlation, pvalue) = kendalltau(X[0:len(X) - 1], Y[1:]) if -1 <= correlation <= 1: print "Walking Analysis | ID: %s | pid : %s | correlation: %f, pvalue: %f" % ( ID, pid, correlation, pvalue) X = np.array(X) X = X.reshape(len(X), 1) Y = map(toNumerical, Y) Y = np.array(Y) model = mord.OrdinalLogistic() try: result = model.fit(X, Y) # Coefficient of the first feature (only one feature here) print "Coefficient | ", result.coef_ # print result.theta_ except Exception as ex: print ex pass
def heartRateAnalysis(): """ Analysis of the heart rates of our participants per day """ # heart rate has mtype 1 folder = "out/" csvDir = folder + "csv/Heart Rate" plotDir = folder + "plots/Heart Rate plots" boxPlotDir = folder + "plots/Heart Rate boxplots" measurementDictionary = OrderedDict() try: os.makedirs(csvDir) except OSError: pass # path already exists for (ID, pid, device) in basisPeak + fitBit: measurements = None if device == "basispeak": measurements = db.measurements.find({"pid": pid, "mtype": 1, "date": {"$gte": start, "$lt": end}}).sort( [("date", 1)]) measurements = list(measurements) elif device == "fitbit": measurements = db.diaFitBitPatients.find( {"pid": pid, "mtype": 1, "date": {"$gte": start, "$lt": end}}).sort( [("date", 1)]) measurements = list(measurements) elif device == "microsoftband": measurements = [] # The measurements for this device were not present in the database while writing this code keys = [] for key, group in groupby(measurements, lambda x: x["date"].strftime("%y-%m-%d")): # group the measurements based on date keys.append(key) measurementDictionary[key] = [el for el in group if isinstance(el["value"], int) and not math.isnan(el["value"]) and el["value"] == el["value"]] # Create a list witch contains list with all the measurement of the same date measurementsPerDate = [] dateObjects = [] for key in keys: dateMeasurement = [measurement["value"] for measurement in measurementDictionary[key]] measurementsPerDate.append(dateMeasurement) # Finding the date objects if dateMeasurement: newDate = measurementDictionary[key][0]['date'] if not dateObjects: dateObjects.append(newDate) else: similarDates = [date for date in dateObjects if date.year == newDate.year and date.month == newDate.month and date.day == newDate.day ] if not similarDates: dateObjects.append(newDate) # The boxplot fig = plt.figure() plt.xlabel("Date") plt.ylabel("Heart rate") plt.title("ID: %s" % ID) bp = plt.gca().boxplot(measurementsPerDate, patch_artist=True) 
fig.autofmt_xdate() plt.xticks(np.arange(1, len(keys) + 1), keys, rotation=45) fancyBoxPlot(bp) writeToPdf(ID, boxPlotDir) ratings = questionAnalysis(ID) with open(os.path.join(csvDir, ID + ".csv"), "w") as csvFile: writer = csv.writer(csvFile, delimiter=",") writer.writerow(("Date", "Rating", "Median", "Average", "Standard Deviation")) X = [] # contains the measurements Y = [] # contains the ratings intersectedDates = np.intersect1d([el[1] for el in ratings], keys) for (rating, date) in ratings: if date in intersectedDates: measurements = [measurement["value"] for measurement in measurementDictionary[date]] measurements = sorted(measurements) writer.writerow(( date, rating, np.median(measurements), round(np.average(measurements)), round(np.std(measurements), 2) )) measurements = measurements[0: int(len(measurements) * 0.05)] stableHeartRate = np.average(measurements) Y.append(rating) X.append(stableHeartRate) alpha = 0.5 lineWidth = 0.15 s = 100 dateObject = measurementDictionary[date][0]['date'] # Normal plotting if rating == "Goed": plt.scatter(dateObject, stableHeartRate, s=s, color="g", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Good day") elif rating == "Gemiddeld": plt.scatter(dateObject, stableHeartRate, s=s, color="orange", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Average day") elif rating == "Slecht": plt.scatter(dateObject, stableHeartRate, s=s, color="r", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Bad day") plt.gca().xaxis.set_major_formatter(DateFormatter('%y-%m-%d')) plt.gca().xaxis.set_major_locator(WeekdayLocator()) plt.setp(plt.gca().xaxis.get_majorticklabels(), rotation=40) plt.xlabel("Date") plt.ylabel("Heart rate") fancyPlot(dateLimit=True) writeToPdf(ID, plotDir) (correlation, pvalue) = kendalltau(X[0:len(X) - 1], Y[1:]) if -1 <= correlation <= 1: print "Walking Analysis | ID: %s | pid : %s | correlation: %f, pvalue: %f" % ( ID, pid, correlation, pvalue) # Use Ordinal 
Regression, see Mord # https://en.wikipedia.org/wiki/Ordinal_regression X = np.array(X) X = X.reshape(len(X), 1) Y = map(toNumerical, Y) Y = np.array(Y) model = mord.OrdinalLogistic() """ fit() calls threshold_fit(), which makes use of the optimize.minimize() method. To prevent the "Desired error not necessarily achieved due to precision loss" message, add add the following paramter to this function: optimize.minize(..., ..., method='Nelder-Mead') """ result = model.fit(X, Y)
def energyAnalysis(): """ Analysis of the energy analysis of our participants per day """ energyQuestionId = "HanWRjvZe8PiLvfD4" folder = "out/" plotDir = folder + "plots/Energy Analysis plots" try: os.makedirs(plotDir) except OSError: pass # path already exists for ID in IDs: experiments = db_mijnKwik.observations.find({"userId": ID, "questionId": energyQuestionId, "timestamp": {"$gte": start, "$lt": end}}).sort([("timestamp", 1)]) experiments = list(experiments) X = [] # The measurements Y = [] # The ratings keys = [] measurementDictionary = OrderedDict() ratings = questionAnalysis(ID) for key, group in groupby(experiments, lambda x: x["timestamp"].strftime("%y-%m-%d")): # group the measurements based on date keys.append(key) measurementDictionary[key] = [el for el in group] plt.xlabel("Date") plt.ylabel("Energy Level") alpha = 0.5 lineWidth = 0.15 s = 100 intersectedDate = np.intersect1d([el[1] for el in ratings], keys) for (rating, date) in ratings: if date in intersectedDate: measurements = measurementDictionary[date] dateObject = measurements[0]['timestamp'] energyRating = np.average([measurement['value'] for measurement in measurements]) X.append(energyRating) Y.append(rating) if rating == "Goed": plt.scatter(dateObject, energyRating, s=s, color="g", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Good day") elif rating == "Gemiddeld": plt.scatter(dateObject, energyRating, s=s, color="orange", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Average day") elif rating == "Slecht": plt.scatter(dateObject, energyRating, s=s, color="r", alpha=alpha, linewidth=lineWidth, edgecolor=almost_black, label="Bad day") else: print "Something went wrong. 
The value of rating is ", rating plt.gca().xaxis.set_major_formatter(DateFormatter('%y-%m-%d')) plt.gca().xaxis.set_major_locator(WeekdayLocator()) plt.setp(plt.gca().xaxis.get_majorticklabels(), rotation=40) fancyPlot(dateLimit=True) writeToPdf(ID, plotDir) """ To predict the next day, we need to drop certain elements in the list. Lets look at an example. X: 4,2,4,5 (The measurements) Y: G,A,B,A (The ratings) By dropping the last element of the "X" array and dropping the first element of the "Y" array we get the following lists: X: 4,2,4 Y: A,B,A This is exactly what we need! PS: we need to make sure that all the dates are consecutive """ (correlation, pvalue) = kendalltau(X[0:len(X) - 1], Y[1:]) if not math.isnan(correlation) and not math.isnan(pvalue): print "Walking Analysis | ID: %s | correlation: %f, pvalue: %f" % (ID, correlation, pvalue) X = np.array(X).T # Because energy rating is nominal data, we can use normal logistic regression logic = LogisticRegression() logic.fit(X.reshape(len(X), 1), Y) print logic.coef_ # Coefficient of the first feature (only one feature here)