Example 1
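Extracting tweets from a Twitter data export: tweet.js is first converted to plain JSON, then each tweet becomes an Entry, with replies recording the parent tweet's ID.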
    def extract_entries(self) -> Generator[Entry, None, None]:
        account_info = self.get_account_info()

        js_file_path = self.extracted_files_path / 'data/tweet.js'
        json_file_path = self.extracted_files_path / 'data/tweet.json'
        remove_twitter_js(js_file_path, json_file_path)

        with json_file_path.open('r', encoding='utf-8') as json_file:
            json_entries = [t['tweet'] for t in json.load(json_file)]

            logger.info(f"Adding tweets found in {str(json_file_path)}")
            for tweet in json_entries:
                entry = Entry(
                    schema='social.twitter.tweet',
                    title='',
                    description=tweet['full_text'],
                    date_on_timeline=twitter_date_to_datetime(
                        tweet['created_at']),
                    extra_attributes={
                        "post_id": tweet['id'],
                        "post_user": account_info['username'],
                        "source": self.entry_source,
                    },
                    source=self.entry_source,
                )

                if tweet.get('in_reply_to_status_id'):
                    entry.extra_attributes['post_parent_id'] = tweet[
                        'in_reply_to_status_id']

                yield entry
Example 2
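Extracting events from iCalendar (.ics) files, with optional end date, creation date, and location metadata.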
    def extract_entries(self) -> Generator[Entry, None, None]:
        for ics_file in self.get_archive_files():
            with open(ics_file, 'r') as file:
                calendar = Calendar.from_ical(file.read())
                for event in calendar.walk('VEVENT'):
                    event_metadata = defaultdict(dict)
                    event_metadata['event']['start_date'] = datetime_to_json(self.normalize_date(event['DTSTART'].dt))

                    if event.get('DTEND'):
                        event_metadata['event']['end_date'] = datetime_to_json(self.normalize_date(event['DTEND'].dt))

                    if event.get('DTSTAMP'):
                        event_metadata['event']['creation_date'] = datetime_to_json(self.normalize_date(event['DTSTAMP'].dt))

                    if event.get('LOCATION'):
                        event_metadata['location']['name'] = event['LOCATION']

                    yield Entry(
                        source=self.entry_source,
                        schema='event',
                        title=str(event.get('SUMMARY', '')),
                        description=str(event.get('DESCRIPTION', '')),
                        date_on_timeline=self.normalize_date(event['DTSTART'].dt),
                        extra_attributes=dict(event_metadata),
                    )
Example 3
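Extracting activity history entries from JSON files, keeping only those whose title starts with a given prefix (the title/titleUrl/time fields suggest a Google Takeout export).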
    def extract_history_entries(self, json_files: Iterable[Path], schema: str,
                                prefix: str) -> Generator[Entry, None, None]:
        for json_file in json_files:
            logger.info(f'Processing activity in "{str(json_file)}"')
            with json_file.open('r', encoding='utf-8') as file_handle:
                json_entries = json.load(file_handle)
            for entry in json_entries:
                if entry['title'].startswith(prefix):
                    try:
                        time = pytz.utc.localize(datetime.strptime(entry['time'], '%Y-%m-%dT%H:%M:%S.%fZ'))
                    except ValueError:
                        time = json_to_datetime(entry['time'])

                    extra_attributes = {}
                    if entry.get('titleUrl'):
                        extra_attributes['url'] = entry['titleUrl']

                    try:
                        yield Entry(
                            title=entry['title'].replace(prefix, '', 1),
                            description='',
                            source=self.entry_source,
                            schema=schema,
                            date_on_timeline=time,
                            extra_attributes=extra_attributes
                        )
                    except Exception:
                        # Exception excludes KeyboardInterrupt, so no separate clause is needed
                        logger.exception(f"Could not parse entry: {entry}")
                        raise
Example 4
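Building a call Entry from a Telegram chat export, telling incoming from outgoing calls by the actor ID.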
    def entry_from_call(self, account: dict, chat: dict,
                        message: dict) -> Entry:
        if message['actor_id'] == self.account_id(account):  # Outgoing call
            caller1 = {'name': message['actor'], 'id': message['actor_id']}
            caller2 = {
                'name': chat['name'],
                'id': f"user{chat['id']}"
            }  # The chat ID is the other user's ID
        else:  # Incoming call
            caller1 = {
                'name': self.account_name(account),
                'id': self.account_id(account)
            }
            caller2 = {'name': message['actor'], 'id': message['actor_id']}

        return Entry(
            source=self.entry_source,
            schema='call.telegram',
            title='',
            description='',
            extra_attributes={
                'duration': message.get('duration_seconds',
                                        0),  # Not set for failed calls
                'caller1_name': caller1['name'],
                'caller1_id': caller1['id'],
                'caller2_name': caller2['name'],
                'caller2_id': caller2['id'],
            },
            date_on_timeline=self.get_message_date(message),
        )
Example 5
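Re-importing previously exported entries: each JSON object is validated through EntrySerializer before being yielded as a new Entry.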
    def extract_entries(self) -> Generator[Entry, None, None]:
        for json_file in self.get_archive_files():
            # get_archive_files() appears to yield Paths (the other examples
            # call .open() on its results), so the file is opened explicitly
            with json_file.open('r', encoding='utf-8') as file_handle:
                json_entries = json.load(file_handle)
            for json_entry in json_entries:
                json_entry['source'] = self.entry_source
                json_entry.pop('id', None)  # drop any pre-existing ID
                serializer = EntrySerializer(data=json_entry)
                assert serializer.is_valid()
                yield Entry(**serializer.validated_data)
Example 6
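A small helper that builds a browsing-history Entry for a visited URL.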
def browsing_history_entry(date_on_timeline: datetime, archive: 'BaseArchive', url: str, title: str = '') -> Entry:
    return Entry(
        title=title or '',
        description='',
        schema='activity.browsing.website',
        source=archive.entry_source,
        extra_attributes={
            'url': url,
        },
        date_on_timeline=date_on_timeline,
    )
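A minimal usage sketch for this helper; my_archive and the visit data are illustrative assumptions, not part of the original:

from datetime import datetime
import pytz

# Hypothetical call: my_archive stands in for any object exposing entry_source
entry = browsing_history_entry(
    date_on_timeline=pytz.utc.localize(datetime(2021, 6, 1, 12, 0)),  # assumed visit time
    archive=my_archive,
    url='https://example.com/',
    title='Example Domain',
)
entry.save()  # Entry is a Django model in these examples, so save() persists it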
Example 7
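Building a location Entry from a GPS track point (latitude, longitude, elevation).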
    def entry_from_point(self, point) -> Entry:
        return Entry(schema='activity.location',
                     source=self.entry_source,
                     title=getattr(point, 'name', None) or '',
                     description=getattr(point, 'description', None)
                     or getattr(point, 'comment', None) or '',
                     extra_attributes={
                         'location': {
                             'latitude': point.latitude,
                             'longitude': point.longitude,
                             'altitude': point.elevation,
                         },
                     },
                     date_on_timeline=datetime_to_json(point.time))
Example 8
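Extracting income and expense entries from a bank's CSV transaction export; the transactions carry no time of day, so noon Berlin time is assumed.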
    def extract_entries(self) -> Generator[Entry, None, None]:
        default_currency = 'EUR'
        default_timezone = 'Europe/Berlin'  # TODO: If this thing gets a million users, that assumption could be wrong
        income_types = ('Income', 'Direct Debit Reversal')

        for csv_file in self.get_archive_files():
            for line in csv.DictReader(codecs.iterdecode(csv_file.open('rb'), 'utf-8'), delimiter=',', quotechar='"'):
                schema = 'finance.income' if line['Transaction type'] in income_types else 'finance.expense'

                you = {
                    'currency': default_currency,
                    'amount': Decimal(line['Amount (EUR)']).copy_abs(),
                    'name': None,
                }

                other_party = {
                    'currency': line['Type Foreign Currency'] or default_currency,
                    'amount': Decimal(line['Amount (Foreign Currency)'] or line['Amount (EUR)']).copy_abs(),
                    'name': line['Payee'],
                }

                sender = you if schema == 'finance.expense' else other_party
                recipient = other_party if schema == 'finance.expense' else you

                # The transactions don't have a time. Set it to noon, Berlin time
                entry_date = pytz.timezone(default_timezone)\
                    .localize(datetime.strptime(line['Date'], '%Y-%m-%d'))\
                    .replace(hour=12)\
                    .astimezone(pytz.UTC)

                yield Entry(
                    schema=schema,
                    source=self.entry_source,
                    title=line['Transaction type'],
                    description=line['Payment reference'],
                    extra_attributes={
                        'sender_amount': str(sender['amount']),
                        'sender_currency': sender['currency'],
                        'sender_name': sender['name'],
                        'recipient_amount': str(recipient['amount']),
                        'recipient_currency': recipient['currency'],
                        'recipient_name': recipient['name'],
                    },
                    date_on_timeline=entry_date
                )
Example 9
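Building an Entry from a file path, refining the schema and extra attributes by mimetype (image, video, audio, text).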
def entry_from_file_path(file_path: Path, source: BaseSource) -> Entry:
    """
    Creates an Entry template from a file path, filling the fields with file metadata.
    """
    mimetype = get_mimetype(file_path)
    entry = Entry(
        title=file_path.name,
        source=source.entry_source,
        schema=get_schema_from_mimetype(mimetype),
        extra_attributes={
            'file': {
                'checksum': get_checksum(file_path),
                'path': str(file_path.resolve()),
                'mimetype': mimetype,
            },
        },
    )
    entry.date_on_timeline = get_file_entry_date(entry)

    if mimetype:
        if mimetype.startswith('image/'):
            entry.schema = 'file.image'
            entry.extra_attributes.update(
                get_image_extra_attributes(file_path))
        if mimetype.startswith('video/'):
            entry.schema = 'file.video'
            try:
                entry.extra_attributes.update(
                    get_video_extra_attributes(file_path))
            except FileFormatError:
                logger.exception(
                    f"Could not read metadata for video {str(file_path)}")
        if mimetype.startswith('audio/'):
            entry.schema = 'file.audio'
            entry.extra_attributes.update(
                get_audio_extra_attributes(file_path))
        if mimetype.startswith('text/'):
            entry.schema = 'file.text'
            with file_path.open('r') as text_file:
                entry.description = text_file.read(
                    settings.MAX_PLAINTEXT_PREVIEW_SIZE)

    return entry
Example 10
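Creating commit entries for a Git repository; the Repository(...).traverse_commits() call matches pydriller's API.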
    def process(self, force=False) -> Tuple[int, int]:
        filters = {}
        if self.author_name:
            filters['only_authors'] = [
                self.author_name,
            ]

        commits = Repository(self.repo_url, **filters).traverse_commits()

        self.get_entries().delete()

        entries_to_create = []
        for commit in commits:
            entries_to_create.append(
                Entry(title=commit.msg,
                      description=commit.hash,
                      date_on_timeline=commit.committer_date.astimezone(
                          pytz.UTC),
                      schema='commit',
                      source=self.entry_source,
                      extra_attributes={
                          'hash': commit.hash,
                          'url': self.get_commit_url(commit),
                          'author': {
                              'email': commit.author.email,
                              'name': commit.author.name,
                          },
                          'changes': {
                              'files': commit.files,
                              'insertions': commit.insertions,
                              'deletions': commit.deletions,
                          },
                          'repo': {
                              'name': self.get_repo_name()
                              or commit.project_name,
                              'url': self.get_repo_url(),
                          },
                      }))
        Entry.objects.bulk_create(entries_to_create)
        return len(entries_to_create), 0
Example 11
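Extracting Google Fit session history from a Takeout export, storing the duration and aggregate metrics as extra attributes.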
    def extract_fit_history(self) -> Generator[Entry, None, None]:
        json_files = list((self.extracted_files_path / 'Takeout/Fit/All sessions/').glob('*.json'))
        logger.info(f'Processing fit history in "{self.entry_source}". '
                    f'{len(json_files)} files found.')

        for json_file in json_files:
            logger.info(f'Processing fit history entries in {str(json_file)}')
            with json_file.open(encoding='utf-8') as json_file_handle:
                json_entry = json.load(json_file_handle)

            try:
                time = pytz.utc.localize(datetime.strptime(json_entry['startTime'], '%Y-%m-%dT%H:%M:%S.%fZ'))
            except ValueError:
                time = json_to_datetime(json_entry['startTime'])

            try:
                activity = json_entry['fitnessActivity']
                # remove any non-digit characters (e.g. 's' for seconds)
                duration_sec = re.sub(r'[^0-9.]', '', json_entry['duration'])
            except Exception:
                logger.exception(f"Could not parse entry: {json_entry}")
                raise

            # Extra attributes: "heart minutes", step count, calories, distance, speed, active minutes
            extra_attributes = {'duration': duration_sec}
            for elem in json_entry.get('aggregate', []):
                # metricName is usually of the form com.google.heart_minutes.summary - keep the third part
                key = elem['metricName'].split('.')[2]
                value = elem.get('floatValue') or elem.get('intValue')
                extra_attributes[key] = value

            yield Entry(
                title=activity,
                description='',
                source=self.entry_source,
                schema='activity.exercise.session',
                date_on_timeline=time,
                extra_attributes=extra_attributes
            )
Example 12
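A helper that builds a geolocation Entry, attaching altitude and accuracy only when they are provided.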
def geolocation_entry(date_on_timeline: datetime, latitude: float, longitude: float, archive: 'BaseArchive',
                      altitude: Optional[float] = None, accuracy: Optional[int] = None, title: str = '') -> Entry:
    entry = Entry(
        title=title or '',
        description='',
        schema='activity.location',
        source=archive.entry_source,
        extra_attributes={
            'location': {
                'latitude': latitude,
                'longitude': longitude,
            },
        },
        date_on_timeline=date_on_timeline,
    )

    if altitude is not None:
        entry.extra_attributes['location']['altitude'] = altitude

    if accuracy is not None:
        entry.extra_attributes['location']['accuracy'] = accuracy

    return entry
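A minimal usage sketch; the coordinates and my_archive are illustrative assumptions:

from datetime import datetime
import pytz

# Hypothetical call: altitude and accuracy are optional, and are only added
# to extra_attributes['location'] when provided
point_entry = geolocation_entry(
    date_on_timeline=pytz.utc.localize(datetime(2021, 6, 1, 8, 30)),  # assumed fix time
    latitude=52.52,
    longitude=13.405,
    archive=my_archive,
    altitude=34.0,
    accuracy=10,
)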
Example 13
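Turning a Facebook Messenger message into one or more entries: calls, one Entry per attachment (videos, photos, files, gifs, audio, stickers), and a text Entry.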
    def entries_from_message(self, message: dict, chat_title: str,
                             chat_participants: Iterable) -> Generator[Entry, None, None]:
        sender = message['sender_name']
        recipient = None

        if message['sender_name'] != chat_title:
            # if it's from you to the other participant, or from any participant to a group chat
            recipient = chat_title
        else:
            # otherwise, it's an inbound message in a 2 person chat.
            # Materialize the participants first: the Iterable may be a
            # one-shot iterator, and it is both counted and iterated below.
            participants = list(chat_participants)
            assert len(participants) == 2
            for participant in participants:
                if participant != message['sender_name']:
                    recipient = participant

        assert recipient is not None

        if message['type'] == 'Call':
            # This method is a generator (it yields below), so the call Entry
            # must be yielded rather than returned
            yield Entry(
                source=self.entry_source,
                schema='call.facebook',
                title='',
                description='',
                extra_attributes={
                    'caller1_name': sender,
                    'caller1_id': sender,
                    'caller2_name': recipient,
                    'caller2_id': recipient,
                    'duration': message['call_duration'],
                },
                date_on_timeline=self.message_date(message),
            )
            return
        else:
            message_date = self.message_date(message)
            message_metadata = {
                'sender_name': sender,
                'sender_id': sender,
                'recipient_name': recipient,
                'recipient_id': recipient,
            }

            # Each photo/video/audio/gif in a message is a distinct Entry
            attachment_types = (
                ('videos', 'video', 'message.facebook.video'),
                ('photos', 'photo', 'message.facebook.image'),
                ('files', 'file', 'message.facebook.file'),
                ('gifs', 'gif', 'message.facebook.gif'),
                ('audio_files', 'audio', 'message.facebook.audio'),
            )
            for message_key, attachment_name, schema in attachment_types:
                for attachment in message.get(message_key, []):
                    if attachment['uri'].startswith('https://'):
                        logger.warning(f"Ignoring {attachment_name} attachment {attachment['uri']}")
                        continue
                    yield self.entry_from_attachment(
                        schema=schema,
                        file_path=attachment['uri'],
                        date_on_timeline=message_date,
                        extra_attributes=message_metadata)

            if "sticker" in message:
                if message['sticker']['uri'].startswith('https://'):
                    logger.warning(
                        f"Ignoring sticker attachment {message['sticker']['uri']}"
                    )
                else:
                    yield self.entry_from_attachment(
                        schema='message.facebook.sticker',
                        file_path=message['sticker']['uri'],
                        date_on_timeline=message_date,
                        extra_attributes=message_metadata)

            if message.get('content'):
                yield Entry(
                    source=self.entry_source,
                    schema='message.facebook',
                    title='',
                    description=message['content'],
                    date_on_timeline=message_date,
                    extra_attributes=message_metadata,
                )
Example 14
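A model test: a blank Entry should fail validation.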
    def test_create_entry_without_shop(self):
        """test: a blank Entry should raise a ValidationError"""
        entry = Entry()
        with self.assertRaises(ValidationError):
            entry.full_clean()
Example 15
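Building a message Entry from a Telegram chat export; the schema depends on the attached file's mimetype, or on the message's media_type when there is no file. (The snippet is truncated at both ends.)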
    def entry_from_message(self, account: dict, chat: dict,
                           message: dict) -> Entry:
        if file := self.get_message_file_path(message):
            entry = entry_from_file_path(file, self)
            mimetype = entry.extra_attributes['file']['mimetype']
            if mimetype and mimetype.startswith('audio'):
                entry.schema = 'message.telegram.audio'
            elif mimetype and mimetype.startswith('video'):
                entry.schema = 'message.telegram.video'
            elif mimetype and mimetype.startswith('image'):
                entry.schema = 'message.telegram.image'
        else:
            entry = Entry()
            if message.get('media_type') == 'sticker':
                entry.schema = 'message.telegram.sticker'
            elif message.get('media_type') == 'animation':
                entry.schema = 'message.telegram.gif'
            else:
                entry.schema = 'message.telegram'

        entry.source = self.entry_source
        entry.description = self.get_message_text(message)
        entry.date_on_timeline = self.get_message_date(message)

        # Set message metadata
        if chat['type'] == 'personal_chat':
            # For personal chats, messages are from one user to another user.
            # In the telegram data, the chat ID is the same as the other user's ID.
Example 16
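Deleting and recreating the entries for a directory of files, with inode and checksum caches to avoid recomputing metadata for unchanged files.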
def create_entries_from_directory(path: Path,
                                  source: BaseSource,
                                  backup_date: datetime,
                                  use_cache=True) -> List[Entry]:
    """
    Delete and recreate the Entries for the files in a directory.
    """
    timelineinclude_rules = list(
        get_include_rules_for_dir(path, settings.TIMELINE_INCLUDE_FILE))
    files = list(
        get_files_matching_rules(get_files_in_dir(path),
                                 timelineinclude_rules))

    inode_checksum_cache = {}  # translates file inodes to checksums
    metadata_cache = {}  # translates checksums to entry metadata
    cached_extra_attributes = ('location', 'media', 'previews')
    if use_cache:
        # Most files in a directory already have a matching Entry. Recalculating the metadata for each file Entry is
        # wasteful and time-consuming.
        # Instead, we build a cache of all files that have an Entry. If we process a file that already has an Entry (if
        # they have the same inode), we can reuse the cached Entry metadata.
        for entry in source.get_entries():
            try:
                # We also avoid calculating checksums if we don't have to. Instead, we compare the file inodes. If the
                # inodes are the same, THEN we calculate and compare the checksums. If the file in the Entry and the
                # file in the directory have the same checksum, then they're identical, and we can reuse the metadata.
                entry_file_inode = Path(
                    entry.extra_attributes['file']['path']).stat().st_ino
                inode_checksum_cache[
                    entry_file_inode] = entry.extra_attributes['file'][
                        'checksum']
            except FileNotFoundError:
                # This can happen if the file in the Entry was deleted or moved.
                pass

            metadata = {}

            for attribute in cached_extra_attributes:
                if attribute in entry.extra_attributes:
                    metadata[attribute] = entry.extra_attributes[attribute]

            if entry.description:
                metadata['description'] = entry.description

            metadata_cache[entry.extra_attributes['file']
                           ['checksum']] = metadata

    entries_to_create = []
    for file in files:
        file = file.resolve()  # Path.resolve() returns a new path; the result must be kept

        try:
            checksum = inode_checksum_cache.get(
                file.stat().st_ino) or get_checksum(file)
        except OSError:
            logger.exception(f"Could not generate checksum for {str(file)}")
            raise

        if checksum in metadata_cache:
            mimetype = get_mimetype(file)
            entry = Entry(title=file.name,
                          source=source.entry_source,
                          schema=get_schema_from_mimetype(mimetype),
                          description=metadata_cache[checksum].get(
                              'description', ''),
                          extra_attributes={
                              'file': {
                                  'path': str(file),
                                  'checksum': checksum,
                                  'mimetype': mimetype,
                              },
                          })

            for attribute in cached_extra_attributes:
                if attribute in metadata_cache[checksum]:
                    entry.extra_attributes[attribute] = metadata_cache[
                        checksum][attribute]
        else:
            entry = entry_from_file_path(file, source)

        entry.extra_attributes['backup_date'] = datetime_to_json(backup_date)

        entry.date_on_timeline = get_file_entry_date(
            entry)  # This could change, so it's not cached
        entries_to_create.append(entry)

    source.get_entries().delete()  # TODO: Only delete the entries in the specified directory?
    return Entry.objects.bulk_create(entries_to_create)