def extract_entries(self) -> Generator[Entry, None, None]:
    account_info = self.get_account_info()
    js_file_path = self.extracted_files_path / 'data/tweet.js'
    json_file_path = self.extracted_files_path / 'data/tweet.json'

    # The archive ships tweets as a JavaScript assignment; strip the wrapper to get plain JSON.
    remove_twitter_js(js_file_path, json_file_path)

    with json_file_path.open('r', encoding='utf-8') as json_file:
        json_entries = [t['tweet'] for t in json.load(json_file)]

    logger.info(f"Adding tweets found in {str(json_file_path)}")
    for tweet in json_entries:
        entry = Entry(
            schema='social.twitter.tweet',
            title='',
            description=tweet['full_text'],
            date_on_timeline=twitter_date_to_datetime(tweet['created_at']),
            extra_attributes={
                'post_id': tweet['id'],
                'post_user': account_info['username'],
                'source': self.entry_source,
            },
            source=self.entry_source,
        )
        if tweet.get('in_reply_to_status_id'):
            entry.extra_attributes['post_parent_id'] = tweet['in_reply_to_status_id']
        yield entry

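# A minimal sketch of the twitter_date_to_datetime helper used above, assuming
# tweets carry Twitter's classic created_at format
# (e.g. "Wed Oct 10 20:19:24 +0000 2018"). Illustrative only; the canonical
# helper lives elsewhere in the codebase and may differ.
from datetime import datetime

def twitter_date_to_datetime(date_string: str) -> datetime:
    # %z makes the result timezone-aware
    return datetime.strptime(date_string, '%a %b %d %H:%M:%S %z %Y')
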
def extract_entries(self) -> Generator[Entry, None, None]:
    for ics_file in self.get_archive_files():
        with open(ics_file, 'r') as file:
            calendar = Calendar.from_ical(file.read())
        for event in calendar.walk('VEVENT'):
            event_metadata = defaultdict(dict)
            event_metadata['event']['start_date'] = datetime_to_json(self.normalize_date(event['DTSTART'].dt))
            if event.get('DTEND'):
                event_metadata['event']['end_date'] = datetime_to_json(self.normalize_date(event['DTEND'].dt))
            if event.get('DTSTAMP'):
                event_metadata['event']['creation_date'] = datetime_to_json(self.normalize_date(event['DTSTAMP'].dt))
            if event.get('LOCATION'):
                event_metadata['location']['name'] = event['LOCATION']
            yield Entry(
                source=self.entry_source,
                schema='event',
                title=str(event.get('SUMMARY', '')),
                description=str(event.get('DESCRIPTION', '')),
                date_on_timeline=self.normalize_date(event['DTSTART'].dt),
                extra_attributes=dict(event_metadata),
            )

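# A hedged sketch of the normalize_date helper used above. icalendar's
# DTSTART/DTEND values (.dt) can be a timezone-aware datetime, a naive
# datetime, or a plain date for all-day events; the assumption here is that
# normalize_date coerces all three into timezone-aware UTC datetimes.
from datetime import datetime, time
import pytz

def normalize_date(value) -> datetime:
    if isinstance(value, datetime):
        return value.astimezone(pytz.UTC) if value.tzinfo else pytz.UTC.localize(value)
    # All-day events yield a plain date; pin it to midnight UTC
    return pytz.UTC.localize(datetime.combine(value, time.min))
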
def extract_history_entries(self, json_files: Iterable[Path], schema: str, prefix: str) -> Generator[Entry, None, None]:
    for json_file in json_files:
        logger.info(f'Processing activity in "{str(json_file)}"')
        with json_file.open('r', encoding='utf-8') as file:
            entries = json.load(file)
        for entry in entries:
            if not entry['title'].startswith(prefix):
                continue
            try:
                time = pytz.utc.localize(datetime.strptime(entry['time'], '%Y-%m-%dT%H:%M:%S.%fZ'))
            except ValueError:
                # Some timestamps have no fractional seconds; fall back to the generic parser
                time = json_to_datetime(entry['time'])

            extra_attributes = {}
            if entry.get('titleUrl'):
                extra_attributes['url'] = entry['titleUrl']

            try:
                yield Entry(
                    title=entry['title'].replace(prefix, '', 1),
                    description='',
                    source=self.entry_source,
                    schema=schema,
                    date_on_timeline=time,
                    extra_attributes=extra_attributes,
                )
            except Exception:
                logger.exception(f"Could not parse entry: {entry}")
                raise

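# A hedged sketch of the json_to_datetime fallback, assuming it handles
# ISO-8601 timestamps without fractional seconds (e.g. "2014-03-12T08:30:00Z").
# The real helper may accept more formats.
from datetime import datetime
import pytz

def json_to_datetime(date_string: str) -> datetime:
    return pytz.utc.localize(datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%SZ'))
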
def entry_from_call(self, account: dict, chat: dict, message: dict) -> Entry:
    if message['actor_id'] == self.account_id(account):
        # Outgoing call
        caller1 = {'name': message['actor'], 'id': message['actor_id']}
        caller2 = {
            'name': chat['name'],
            'id': f"user{chat['id']}",  # The chat ID is the other user's ID
        }
    else:
        # Incoming call
        caller1 = {
            'name': self.account_name(account),
            'id': self.account_id(account),
        }
        caller2 = {'name': message['actor'], 'id': message['actor_id']}

    return Entry(
        source=self.entry_source,
        schema='call.telegram',
        title='',
        description='',
        extra_attributes={
            'duration': message.get('duration_seconds', 0),  # Not set for failed calls
            'caller1_name': caller1['name'],
            'caller1_id': caller1['id'],
            'caller2_name': caller2['name'],
            'caller2_id': caller2['id'],
        },
        date_on_timeline=self.get_message_date(message),
    )

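# A hypothetical sketch of the get_message_date method used above, assuming
# Telegram's JSON export stores a timezone-less ISO timestamp in
# message['date'] (e.g. "2018-01-22T19:27:42") that is treated as UTC.
# The actual implementation may differ.
from datetime import datetime
import pytz

def get_message_date(self, message: dict) -> datetime:
    return pytz.utc.localize(datetime.fromisoformat(message['date']))
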
def extract_entries(self) -> Generator[Entry, None, None]:
    for json_file in self.get_archive_files():
        with json_file.open('r', encoding='utf-8') as file:
            json_entries = json.load(file)
        for json_entry in json_entries:
            json_entry['source'] = self.entry_source
            json_entry.pop('id', None)  # Never reuse primary keys from the archive
            serializer = EntrySerializer(data=json_entry)
            assert serializer.is_valid()
            yield Entry(**serializer.validated_data)

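# Illustrative input for the loop above, assuming each archive file holds a
# JSON list of serialized entries (field names taken from the Entry fields
# used throughout this module):
#
# [{"schema": "activity.browsing.website",
#   "title": "",
#   "description": "",
#   "date_on_timeline": "2021-01-01T12:00:00Z",
#   "extra_attributes": {"url": "https://example.com"}}]
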
def browsing_history_entry(date_on_timeline: datetime, archive: 'BaseArchive',
                           url: str, title: str = '') -> Entry:
    return Entry(
        title=title or '',
        description='',
        schema='activity.browsing.website',
        source=archive.entry_source,
        extra_attributes={
            'url': url,
        },
        date_on_timeline=date_on_timeline,
    )

def entry_from_point(self, point) -> Entry:
    return Entry(
        schema='activity.location',
        source=self.entry_source,
        title=getattr(point, 'name', None) or '',
        description=getattr(point, 'description', None) or getattr(point, 'comment', None) or '',
        extra_attributes={
            'location': {
                'latitude': point.latitude,
                'longitude': point.longitude,
                'altitude': point.elevation,
            },
        },
        date_on_timeline=datetime_to_json(point.time),
    )

def extract_entries(self) -> Generator[Entry, None, None]:
    default_currency = 'EUR'
    default_timezone = 'Europe/Berlin'  # TODO: If this thing gets a million users, that assumption could be wrong
    income_types = ('Income', 'Direct Debit Reversal')

    for csv_file in self.get_archive_files():
        for line in csv.DictReader(codecs.iterdecode(csv_file.open('rb'), 'utf-8'),
                                   delimiter=',', quotechar='"'):
            schema = 'finance.income' if line['Transaction type'] in income_types else 'finance.expense'
            you = {
                'currency': default_currency,
                'amount': Decimal(line['Amount (EUR)']).copy_abs(),
                'name': None,
            }
            other_party = {
                'currency': line['Type Foreign Currency'] or default_currency,
                'amount': Decimal(line['Amount (Foreign Currency)'] or line['Amount (EUR)']).copy_abs(),
                'name': line['Payee'],
            }
            sender = you if schema == 'finance.expense' else other_party
            recipient = other_party if schema == 'finance.expense' else you

            # The transactions don't have a time. Set it to noon, Berlin time.
            entry_date = pytz.timezone(default_timezone) \
                .localize(datetime.strptime(line['Date'], '%Y-%m-%d')) \
                .replace(hour=12) \
                .astimezone(pytz.UTC)

            yield Entry(
                schema=schema,
                source=self.entry_source,
                title=line['Transaction type'],
                description=line['Payment reference'],
                extra_attributes={
                    'sender_amount': str(sender['amount']),
                    'sender_currency': sender['currency'],
                    'sender_name': sender['name'],
                    'recipient_amount': str(recipient['amount']),
                    'recipient_currency': recipient['currency'],
                    'recipient_name': recipient['name'],
                },
                date_on_timeline=entry_date,
            )

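# Illustrative CSV row for the reader above. The column names are exactly the
# ones the DictReader looks up; a real export may contain additional columns.
#
# "Date","Payee","Transaction type","Payment reference","Amount (EUR)","Amount (Foreign Currency)","Type Foreign Currency"
# "2021-03-01","ACME GmbH","Direct Debit","Invoice 42","-19.99","",""
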
def entry_from_file_path(file_path: Path, source: BaseSource) -> Entry:
    """
    Creates an Entry template from a file path, filling the fields with file metadata.
    """
    mimetype = get_mimetype(file_path)
    entry = Entry(
        title=file_path.name,
        source=source.entry_source,
        schema=get_schema_from_mimetype(mimetype),
        extra_attributes={
            'file': {
                'checksum': get_checksum(file_path),
                'path': str(file_path.resolve()),
                'mimetype': mimetype,
            },
        },
    )
    entry.date_on_timeline = get_file_entry_date(entry)

    if mimetype:
        if mimetype.startswith('image/'):
            entry.schema = 'file.image'
            entry.extra_attributes.update(get_image_extra_attributes(file_path))
        elif mimetype.startswith('video/'):
            entry.schema = 'file.video'
            try:
                entry.extra_attributes.update(get_video_extra_attributes(file_path))
            except FileFormatError:
                logger.exception(f"Could not read metadata for video {str(file_path)}")
        elif mimetype.startswith('audio/'):
            entry.schema = 'file.audio'
            entry.extra_attributes.update(get_audio_extra_attributes(file_path))
        elif mimetype.startswith('text/'):
            entry.schema = 'file.text'
            with file_path.open('r') as text_file:
                entry.description = text_file.read(settings.MAX_PLAINTEXT_PREVIEW_SIZE)

    return entry

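# A hedged sketch of get_schema_from_mimetype. Since entry_from_file_path
# refines the schema per mimetype family anyway, the assumption here is that
# the helper only provides a coarse default; the real implementation may be
# richer.
from typing import Optional

def get_schema_from_mimetype(mimetype: Optional[str]) -> str:
    # Coarse default; refined by the branches in entry_from_file_path
    return 'file'
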
def process(self, force=False) -> Tuple[int, int]:
    filters = {}
    if self.author_name:
        filters['only_authors'] = [self.author_name]

    commits = Repository(self.repo_url, **filters).traverse_commits()
    self.get_entries().delete()

    entries_to_create = []
    for commit in commits:
        entries_to_create.append(Entry(
            title=commit.msg,
            description=commit.hash,
            date_on_timeline=commit.committer_date.astimezone(pytz.UTC),
            schema='commit',
            source=self.entry_source,
            extra_attributes={
                'hash': commit.hash,
                'url': self.get_commit_url(commit),
                'author': {
                    'email': commit.author.email,
                    'name': commit.author.name,
                },
                'changes': {
                    'files': commit.files,
                    'insertions': commit.insertions,
                    'deletions': commit.deletions,
                },
                'repo': {
                    'name': self.get_repo_name() or commit.project_name,
                    'url': self.get_repo_url(),
                },
            },
        ))

    Entry.objects.bulk_create(entries_to_create)
    return len(entries_to_create), 0

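# A hypothetical get_commit_url for a GitHub-hosted repository; the actual
# implementation depends on where the repository is hosted.
from typing import Optional

def get_commit_url(self, commit) -> Optional[str]:
    if self.repo_url.startswith('https://github.com/'):
        base_url = self.repo_url.rstrip('/')
        if base_url.endswith('.git'):
            base_url = base_url[:-len('.git')]
        return f"{base_url}/commit/{commit.hash}"
    return None
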
def extract_fit_history(self) -> Generator[Entry, None, None]:
    json_files = list((self.extracted_files_path / 'Takeout/Fit/All sessions/').glob('*.json'))
    logger.info(f'Processing fit history in "{self.entry_source}". '
                f'{len(json_files)} files found.')
    for json_file in json_files:
        logger.info(f'Processing fit history entries in {str(json_file)}')
        with json_file.open(encoding='utf-8') as json_file_handle:
            json_entry = json.load(json_file_handle)

        try:
            time = pytz.utc.localize(datetime.strptime(json_entry['startTime'], '%Y-%m-%dT%H:%M:%S.%fZ'))
        except ValueError:
            time = json_to_datetime(json_entry['startTime'])

        try:
            activity = json_entry['fitnessActivity']
            # Remove any non-digit characters (e.g. 's' for seconds)
            duration_sec = re.sub('[^0-9.]', '', json_entry['duration'])
        except Exception:
            logger.exception(f"Could not parse entry: {json_entry}")
            raise

        # Extra attributes: "heart minutes", step count, calories, distance, speed, active minutes
        extra_attributes = {'duration': duration_sec}
        for elem in json_entry.get('aggregate', []):
            # Metric names look like "com.google.heart_minutes.summary"; keep the third part
            key = elem['metricName'].split('.')[2]
            extra_attributes[key] = elem.get('floatValue') or elem.get('intValue')

        yield Entry(
            title=activity,
            description='',
            source=self.entry_source,
            schema='activity.exercise.session',
            date_on_timeline=time,
            extra_attributes=extra_attributes,
        )

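# Illustrative "All sessions" JSON shape, inferred from the keys read above;
# a real Google Takeout export may include more fields.
#
# {"startTime": "2021-05-01T07:30:00.000Z",
#  "duration": "1800.0s",
#  "fitnessActivity": "running",
#  "aggregate": [
#      {"metricName": "com.google.heart_minutes.summary", "floatValue": 12.0},
#      {"metricName": "com.google.step_count.delta", "intValue": 3456}]}
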
def geolocation_entry(date_on_timeline: datetime, latitude: float, longitude: float,
                      archive: 'BaseArchive', altitude: float = None,
                      accuracy: int = None, title: str = '') -> Entry:
    entry = Entry(
        title=title or '',
        description='',
        schema='activity.location',
        source=archive.entry_source,
        extra_attributes={
            'location': {
                'latitude': latitude,
                'longitude': longitude,
            },
        },
        date_on_timeline=date_on_timeline,
    )
    if altitude is not None:
        entry.extra_attributes['location']['altitude'] = altitude
    if accuracy is not None:
        entry.extra_attributes['location']['accuracy'] = accuracy
    return entry

def entries_from_message(self, message: dict, chat_title: str,
                         chat_participants: Iterable) -> Generator[Entry, None, None]:
    participants = list(chat_participants)  # Materialize once; the iterable is read twice below
    sender = message['sender_name']
    recipient = None
    if message['sender_name'] != chat_title:
        # It's from you to the other participant, or from any participant to a group chat
        recipient = chat_title
    else:
        # Otherwise, it's an inbound message in a two-person chat
        assert len(participants) == 2
        for participant in participants:
            if participant != message['sender_name']:
                recipient = participant
    assert recipient is not None

    if message['type'] == 'Call':
        yield Entry(
            source=self.entry_source,
            schema='call.facebook',
            title='',
            description='',
            extra_attributes={
                'caller1_name': sender,
                'caller1_id': sender,
                'caller2_name': recipient,
                'caller2_id': recipient,
                'duration': message['call_duration'],
            },
            date_on_timeline=self.message_date(message),
        )
        return

    message_date = self.message_date(message)
    message_metadata = {
        'sender_name': sender,
        'sender_id': sender,
        'recipient_name': recipient,
        'recipient_id': recipient,
    }

    # Each photo/video/audio/gif in a message is a distinct Entry
    attachment_types = (
        ('videos', 'video', 'message.facebook.video'),
        ('photos', 'photo', 'message.facebook.image'),
        ('files', 'file', 'message.facebook.file'),
        ('gifs', 'gif', 'message.facebook.gif'),
        ('audio_files', 'audio', 'message.facebook.audio'),
    )
    for message_key, label, schema in attachment_types:
        for attachment in message.get(message_key, []):
            if attachment['uri'].startswith('https://'):
                logger.warning(f"Ignoring {label} attachment {attachment['uri']}")
                continue
            yield self.entry_from_attachment(
                schema=schema,
                file_path=attachment['uri'],
                date_on_timeline=message_date,
                extra_attributes=message_metadata,
            )

    if 'sticker' in message:
        if message['sticker']['uri'].startswith('https://'):
            logger.warning(f"Ignoring sticker attachment {message['sticker']['uri']}")
        else:
            yield self.entry_from_attachment(
                schema='message.facebook.sticker',
                file_path=message['sticker']['uri'],
                date_on_timeline=message_date,
                extra_attributes=message_metadata,
            )

    if message.get('content'):
        yield Entry(
            source=self.entry_source,
            schema='message.facebook',
            title='',
            description=message['content'],
            date_on_timeline=message_date,
            extra_attributes=message_metadata,
        )

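# Illustrative message dicts for the generator above, using only keys that the
# code actually reads (a real Facebook export contains more fields):
#
# {"sender_name": "Alice", "type": "Call", "call_duration": 120, ...}
# {"sender_name": "Alice", "type": "Generic", "content": "Hello!",
#  "photos": [{"uri": "messages/inbox/alice/photos/img.jpg"}], ...}
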
def test_create_entry_without_shop(self):
    """An Entry created without a shop should fail validation."""
    entry = Entry()
    with self.assertRaises(ValidationError):
        entry.full_clean()

    return text

def entry_from_message(self, account: dict, chat: dict, message: dict) -> Entry:
    if file := self.get_message_file_path(message):
        entry = entry_from_file_path(file, self)
        mimetype = entry.extra_attributes['file']['mimetype']
        if mimetype and mimetype.startswith('audio'):
            entry.schema = 'message.telegram.audio'
        elif mimetype and mimetype.startswith('video'):
            entry.schema = 'message.telegram.video'
        elif mimetype and mimetype.startswith('image'):
            entry.schema = 'message.telegram.image'
    else:
        entry = Entry()
        if message.get('media_type') == 'sticker':
            entry.schema = 'message.telegram.sticker'
        elif message.get('media_type') == 'animation':
            entry.schema = 'message.telegram.gif'
        else:
            entry.schema = 'message.telegram'

    entry.source = self.entry_source
    entry.description = self.get_message_text(message)
    entry.date_on_timeline = self.get_message_date(message)

    # Set message metadata
    if chat['type'] == 'personal_chat':
        # For personal chats, messages are from one user to another user.
        # In the Telegram data, the chat ID is the same as the other user's ID.

def create_entries_from_directory(path: Path, source: BaseSource, backup_date: datetime,
                                  use_cache=True) -> List[Entry]:
    """
    Delete and recreate the Entries for the files in a directory.
    """
    timelineinclude_rules = list(get_include_rules_for_dir(path, settings.TIMELINE_INCLUDE_FILE))
    files = list(get_files_matching_rules(get_files_in_dir(path), timelineinclude_rules))

    inode_checksum_cache = {}  # Translates file inodes to checksums
    metadata_cache = {}  # Translates checksums to entry metadata
    cached_extra_attributes = ('location', 'media', 'previews')

    if use_cache:
        # Most files in a directory already have a matching Entry. Recalculating the metadata for
        # each file Entry is wasteful and time-consuming. Instead, we build a cache of all files
        # that have an Entry. If we process a file that already has an Entry (if they have the
        # same inode), we can reuse the cached Entry metadata.
        for entry in source.get_entries():
            try:
                # We also avoid calculating checksums if we don't have to. Instead, we compare the
                # file inodes. If the inodes are the same, THEN we calculate and compare the
                # checksums. If the file in the Entry and the file in the directory have the same
                # checksum, then they're identical, and we can reuse the metadata.
                entry_file_inode = Path(entry.extra_attributes['file']['path']).stat().st_ino
                inode_checksum_cache[entry_file_inode] = entry.extra_attributes['file']['checksum']
            except FileNotFoundError:
                # This can happen if the file in the Entry was deleted or moved.
                pass

            metadata = {}
            for attribute in cached_extra_attributes:
                if attribute in entry.extra_attributes:
                    metadata[attribute] = entry.extra_attributes[attribute]
            if entry.description:
                metadata['description'] = entry.description
            metadata_cache[entry.extra_attributes['file']['checksum']] = metadata

    entries_to_create = []
    for file in files:
        file = file.resolve()
        try:
            checksum = inode_checksum_cache.get(file.stat().st_ino) or get_checksum(file)
        except OSError:
            logger.exception(f"Could not generate checksum for {str(file)}")
            raise

        if checksum in metadata_cache:
            mimetype = get_mimetype(file)
            entry = Entry(
                title=file.name,
                source=source.entry_source,
                schema=get_schema_from_mimetype(mimetype),
                description=metadata_cache[checksum].get('description', ''),
                extra_attributes={
                    'file': {
                        'path': str(file),
                        'checksum': checksum,
                        'mimetype': mimetype,
                    },
                },
            )
            for attribute in cached_extra_attributes:
                if attribute in metadata_cache[checksum]:
                    entry.extra_attributes[attribute] = metadata_cache[checksum][attribute]
        else:
            entry = entry_from_file_path(file, source)

        entry.extra_attributes['backup_date'] = datetime_to_json(backup_date)
        entry.date_on_timeline = get_file_entry_date(entry)  # This could change, so it's not cached
        entries_to_create.append(entry)

    source.get_entries().delete()  # TODO: Only delete the entries in the specified directory?
    return Entry.objects.bulk_create(entries_to_create)
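
# Example usage under assumed names: FileSystemSource is a hypothetical
# BaseSource subclass that owns the files being indexed.
#
# source = FileSystemSource.objects.get(key='my-files')
# entries = create_entries_from_directory(
#     Path('/backups/2021-06-01/files'),
#     source=source,
#     backup_date=datetime(2021, 6, 1, tzinfo=pytz.UTC),
# )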