def MergeSeriesCollections(
    series: typing.Iterator[me_pb2.SeriesCollection]
) -> me_pb2.SeriesCollection:
  """Merge the given series collections into a single SeriesCollection.

  Args:
    series: The SeriesCollection messages to merge.

  Returns:
    A SeriesCollection message.

  Raises:
    ValueError: If there are Series with duplicate names.
  """
  # Flatten the collections into a single list of Series protos. Use a new
  # name rather than rebinding (and shadowing) the `series` parameter.
  all_series = list(labtypes.flatten(list(f.series) for f in series))
  # Create a map from series name to a list of series protos. A plain for
  # loop is used here, not a comprehension, since we loop for side effects.
  names_to_series = collections.defaultdict(list)
  for s in all_series:
    names_to_series[s.name].append(s)
  # Concatenate each list of series with the same name.
  concatenated_series = [
      ConcatenateSeries(s) for s in names_to_series.values()
  ]
  return me_pb2.SeriesCollection(
      series=sorted(concatenated_series, key=lambda s: s.name))
def ProcessXmlFile(path: pathlib.Path) -> me_pb2.SeriesCollection:
  """Process a HealthKit XML data export.

  Args:
    path: Path of the XML file.

  Returns:
    A SeriesCollection message.

  Raises:
    FileNotFoundError: If the requested file is not found.
  """
  if not path.is_file():
    raise FileNotFoundError(str(path))
  # Delegate the actual XML parsing to a standalone worker binary, which
  # reads and writes SeriesCollection protos on stdin/stdout.
  request = me_pb2.SeriesCollection(source=str(path))
  try:
    worker = bazelutil.DataPath(
        "phd/datasets/me_db/providers/health_kit/xml_export_worker")
    return pbutil.RunProcessMessageInPlace([str(worker)], request)
  except subprocess.CalledProcessError as e:
    raise importers.ImporterError("HealthKit", path, str(e)) from e
def ProcessInbox(inbox: pathlib.Path) -> me_pb2.SeriesCollection:
  """Process a directory of YNAB data.

  Args:
    inbox: The inbox path.

  Returns:
    A SeriesCollection message.
  """
  if not (inbox / "ynab").is_dir():
    return me_pb2.SeriesCollection()

  # Use find(1) with -L so that symlinked budget directories are followed.
  output = subprocess.check_output(
      ["find", "-L", str(inbox / "ynab"), "-name", "Budget.yfull"],
      universal_newlines=True,
  ).rstrip()
  # TODO(cec): There can be multiple directories for a single budget. Do we need
  # to de-duplicate them?
  # Filter out empty strings BEFORE converting to paths: when find matches
  # nothing, output is "" and splitting yields [""]. The previous guard
  # (`if files and files[0]`) was applied after Path conversion, but
  # pathlib.Path("") is Path('.') which is truthy, so empty results slipped
  # through and crashed in ProcessBudgetJsonFile.
  files = [pathlib.Path(f) for f in output.split("\n") if f]

  series_collections = [ProcessBudgetJsonFile(file) for file in files]
  return importers.MergeSeriesCollections(series_collections)
def ProcessCsvFile(path: pathlib.Path) -> me_pb2.SeriesCollection:
  """Process a LifeCycle CSV data export.

  Args:
    path: Path of the CSV file.

  Returns:
    A SeriesCollection message.

  Raises:
    FileNotFoundError: If the requested file is not found.
  """
  if not path.is_file():
    raise FileNotFoundError(str(path))
  # Delegate parsing to a standalone worker binary which communicates via
  # SeriesCollection protos on stdin/stdout.
  request = me_pb2.SeriesCollection(source=str(path))
  try:
    worker = bazelutil.DataPath(
        "phd/datasets/me_db/providers/life_cycle/lc_export_csv_worker")
    return pbutil.RunProcessMessageInPlace([str(worker)], request)
  except subprocess.CalledProcessError as e:
    raise importers.ImporterError("LifeCycle", path, str(e)) from e
def _ReadDatabaseToSeriesCollection(db) -> me_pb2.SeriesCollection:
  """Extract SeriesCollection from sqlite3 Timing.app database.

  Args:
    db: The sqlite3 database.

  Returns:
    A SeriesCollection message.
  """
  cursor = db.cursor()
  # Construct a map from distinct Task.title columns to Series protos.
  cursor.execute('SELECT DISTINCT(title) FROM TASK')
  title_series_map = {row[0]: me_pb2.Series() for row in cursor.fetchall()}

  # Process data from each title separately.
  for title, series in title_series_map.items():
    start_time = time.time()
    # Set the Series message fields.
    series.family = 'ScreenTime'
    # The name of a series is a CamelCaps version of the Task.title. E.g. 'Web'.
    series.name = "".join(title.title().split())
    series.unit = 'milliseconds'
    # Run a query to aggregate columns data. The SQL engine can do all the heavy
    # lifting, with the only processing of data required being the conversion of
    # Application.title to CamelCaps.
    # TODO(cec): What time zone does Timing.app store results in?
    # BUGFIX: the Application join previously read
    # `ON AppActivity.applicationID=AppActivity.id`, which never references
    # the Application table, so Application.title was not correlated with
    # the activity's application. Join on Application.id instead.
    cursor.execute(
        """
SELECT
CAST(ROUND(AppActivity.startDate * 1000.0) AS int) as date,
CAST(ROUND((AppActivity.endDate - AppActivity.startDate) * 1000.0) AS int) as value,
Application.title as `group`
FROM AppActivity
LEFT JOIN Application ON AppActivity.applicationID=Application.id
LEFT JOIN Task ON AppActivity.taskID=Task.id
WHERE Task.title=?
""", (title, ))
    # Create Measurement protos for each of the returned rows.
    series.measurement.extend([
        me_pb2.Measurement(
            ms_since_unix_epoch=date,
            value=value,
            group="".join(group.title().split()) if group else "default",
            source='Timing.app',
        ) for date, value, group in cursor
    ])
    logging.info('Processed %s %s:%s measurements in %.3f seconds',
                 humanize.intcomma(len(series.measurement)), series.family,
                 series.name,
                 time.time() - start_time)
  return me_pb2.SeriesCollection(series=title_series_map.values())
def ProcessBudgetJsonFile(path: pathlib.Path) -> me_pb2.SeriesCollection:
  """Process a YNAB Budget.yfull JSON export.

  Args:
    path: Path of the JSON budget file.

  Returns:
    A SeriesCollection message.

  Raises:
    FileNotFoundError: If the requested file is not found.
    importers.ImporterError: If the worker process fails.
  """
  if not path.is_file():
    raise FileNotFoundError(str(path))
  try:
    return pbutil.RunProcessMessageInPlace([
        str(
            bazelutil.DataPath(
                'phd/datasets/me_db/providers/ynab/json_budget_worker'))
    ], me_pb2.SeriesCollection(source=str(path)))
  except subprocess.CalledProcessError as e:
    # BUGFIX: this is the YNAB importer; the error was previously
    # mis-attributed to 'LifeCycle' (copy-paste from the LifeCycle importer).
    raise importers.ImporterError('YNAB', path, str(e)) from e
def ProcessInbox(inbox: pathlib.Path) -> me_pb2.SeriesCollection:
  """Process Timing.app data in an inbox.

  Args:
    inbox: The inbox path.

  Returns:
    A SeriesCollection message.
  """
  db_path = inbox / "timing" / "SQLite.db"
  # Do nothing if there is no Timing.app database.
  if db_path.is_file():
    return ProcessDatabase(db_path)
  return me_pb2.SeriesCollection()
def ProcessInbox(inbox: pathlib.Path) -> me_pb2.SeriesCollection:
  """Process Life Cycle data in an inbox.

  Args:
    inbox: The inbox path.

  Returns:
    A SeriesCollection message.
  """
  archive_path = inbox / "life_cycle" / "LC_export.zip"
  # Do nothing if there is no LC_export.zip file.
  if not archive_path.is_file():
    return me_pb2.SeriesCollection()

  # Extract the CSV into a scratch directory and hand it to the CSV worker.
  with tempfile.TemporaryDirectory(prefix="phd_") as scratch_dir:
    extracted_csv = pathlib.Path(scratch_dir) / "LC_export.csv"
    with zipfile.ZipFile(archive_path) as archive:
      with archive.open("LC_export.csv") as csv_member:
        extracted_csv.write_bytes(csv_member.read())
    return ProcessCsvFile(extracted_csv)
def ProcessInbox(inbox: pathlib.Path) -> me_pb2.SeriesCollection:
  """Process a directory of HealthKit data.

  Args:
    inbox: The inbox path.

  Returns:
    A SeriesCollection message.
  """
  export_zip = inbox / "health_kit" / "export.zip"
  # Do nothing if there is no HealthKit export.zip file.
  if not export_zip.is_file():
    return me_pb2.SeriesCollection()

  app.Log(1, "Unpacking %s", export_zip)
  # Extract the XML into a scratch directory and hand it to the XML worker.
  with tempfile.TemporaryDirectory(prefix="phd_") as scratch_dir:
    extracted_xml = pathlib.Path(scratch_dir) / "export.xml"
    with zipfile.ZipFile(export_zip) as archive:
      with archive.open("apple_health_export/export.xml") as xml_member:
        extracted_xml.write_bytes(xml_member.read())
    return ProcessXmlFile(extracted_xml)
def ProcessInbox(inbox: pathlib.Path) -> me_pb2.SeriesCollection:
  """Process a directory of HealthKit data.

  Args:
    inbox: The inbox path.

  Returns:
    A SeriesCollection message.
  """
  export_zip = inbox / 'health_kit' / 'export.zip'
  # Do nothing if there is no HealthKit export.zip file.
  if not export_zip.is_file():
    return me_pb2.SeriesCollection()

  logging.info('Unpacking %s', export_zip)
  # Extract the XML into a scratch directory and hand it to the XML worker.
  with tempfile.TemporaryDirectory(prefix='phd_') as scratch_dir:
    extracted_xml = pathlib.Path(scratch_dir) / 'export.xml'
    with zipfile.ZipFile(export_zip) as archive:
      with archive.open('apple_health_export/export.xml') as xml_member:
        extracted_xml.write_bytes(xml_member.read())
    return ProcessXmlFile(extracted_xml)