Ejemplo n.º 1
0
def sync_collection_config_file(local_collections_path, collection_id, bucket):
    """Download a collection's config file from S3 into the local tree.

    The local collection directory is created when missing; anything
    already inside it is left in place.

    Args:
        local_collections_path: Root directory of the local collections.
        collection_id: Identifier of the collection to sync.
        bucket: Bucket identifier handed to the S3 resource's ``Bucket()``.
    """
    # The download needs its destination directory to exist beforehand.
    os.makedirs(
        get_collection_path(collection_id,
                            collections_path=local_collections_path),
        exist_ok=True)

    destination = get_collection_config_filepath(
        collection_id, collections_path=local_collections_path)
    # Called without collections_path, the helper's result is used as the
    # object key inside the bucket.
    source_key = get_collection_config_filepath(collection_id)

    s3_bucket = aws_resource('s3').Bucket(bucket)
    s3_bucket.download_file(source_key, destination)
Ejemplo n.º 2
0
def sync_collection_config(local_collections_path, collection_id, bucket):
    """Rebuild the local collection directory from the config (and lock) in S3.

    Any existing local collection directory is removed first, then the
    config file is downloaded. The lock file is downloaded as well when
    the bucket has one; a "404" response for the lock is tolerated.

    Args:
        local_collections_path: Root directory of the local collections.
        collection_id: Identifier of the collection to sync.
        bucket: Bucket identifier handed to the S3 resource's ``Bucket()``.

    Raises:
        botocore.exceptions.ClientError: If the lock download fails with
            any error code other than "404". Errors from the config file
            download are not caught here.
    """
    collection_dir = get_collection_path(
        collection_id, collections_path=local_collections_path)
    # Start from an empty directory so no stale local files survive the sync.
    shutil.rmtree(collection_dir, ignore_errors=True)
    os.makedirs(collection_dir)

    config_destination = get_collection_config_filepath(
        collection_id, collections_path=local_collections_path)
    config_key = get_collection_config_filepath(collection_id)

    # Collection config file
    aws_resource('s3').Bucket(bucket).download_file(config_key,
                                                    config_destination)

    # Lock file: a missing lock object is not an error.
    try:
        lock_destination = get_lock_file(
            collection_id, collections_path=local_collections_path)
        lock_key = get_lock_file(collection_id)
        aws_resource('s3').Bucket(bucket).download_file(lock_key,
                                                        lock_destination)
    except botocore.exceptions.ClientError as client_error:
        if client_error.response['Error']['Code'] == "404":
            return
        raise
Ejemplo n.º 3
0
def get_collection_config(bucket, collection_id):
    """Download a collection's config JSON and wrap it in a CollectionConfig.

    Args:
        bucket: Bucket to read from, passed through to ``download_json``.
        collection_id: Identifier of the collection whose config is wanted.

    Returns:
        A ``CollectionConfig`` built from the downloaded JSON.
    """
    config_key = get_collection_config_filepath(collection_id)
    raw_config = download_json(bucket, config_key)
    return CollectionConfig(raw_config)
Ejemplo n.º 4
0
def _put_collection_config(bucket, collection_config):
    """Upload a collection config as JSON to its path in the bucket.

    Args:
        bucket: Bucket to write to, passed through to ``upload_json``.
        collection_config: Config mapping; its ``'id'`` entry selects the
            destination path.
    """
    config_key = get_collection_config_filepath(collection_config['id'])
    upload_json(bucket, config_key, collection_config)
Ejemplo n.º 5
0
def collection_config_download_command(bucket, collection_id):
    """Download a collection's config file to ``collection.json``.

    The destination is the relative path ``collection.json``; a
    confirmation message is printed afterwards.

    Args:
        bucket: Bucket to read from, passed through to ``download_file``.
        collection_id: Identifier of the collection whose config is wanted.
    """
    config_key = get_collection_config_filepath(collection_id)
    download_file(bucket, config_key, 'collection.json')
    print('Downloaded to collection.json.')
 def setUp(self):
     """Prepare a temporary collections directory and write a test config."""
     self.collection_id = 'test_id'

     # Each test gets its own freshly created temporary directory.
     temp_root = mkdtemp()
     self.collections_path = temp_root

     config_path = get_collection_config_filepath(
         self.collection_id, collections_path=temp_root)
     self.collection_config_filepath = config_path

     self.write_collection_config()
Ejemplo n.º 7
0
    def harvest(self):
        """Run a harvest for this collection and block until shutdown.

        Steps, in order:

        1. If a bucket is configured, sync the collection config from it.
        2. Call ``assert_locked`` on the collection's lock file.
        3. Start the server thread and, if enabled, the monitoring thread.
        4. Load the collection config from the local file.
        5. Inside the ``S3FileMoverThread`` and ``CollectionLock`` context
           managers: write the harvester's copy of the config, run the
           ``TwarcThread`` to completion, write the harvest info file, and,
           if the changeset has changes, merge them into the latest config
           and write both the config and the changeset.
        6. Set ``stopped_event`` and poll until ``shutdown_event`` is set.

        Raises:
            Whatever exception the ``TwarcThread`` recorded in its
            ``exception`` attribute is re-raised here after the thread is
            joined. Helpers such as ``assert_locked`` may also raise
            (presumably when the lock check fails; confirm against their
            definitions).
        """
        log.info('Starting harvester')
        # Sync
        # Only done when a bucket is configured; otherwise the local files
        # are used as they are.
        if self.bucket:
            sync_collection_config(self.collections_path, self.collection_id,
                                   self.bucket)

        # Check if collection is locked
        assert_locked(
            get_lock_file(self.collection_id,
                          collections_path=self.collections_path))

        # Start the server
        ServerThread(self.stop_event, self.stopped_event, self.shutdown_event,
                     self.harvest_info, self.port).start()

        # Start the monitor
        if self.monitor:
            MonitoringThread().start()

        # Load the collection config
        collection_config = self._load_collection_config()

        # Both context managers stay active for the whole harvest, including
        # the final config and changeset writes below.
        with S3FileMoverThread(self.file_queue, self.collections_path,
                               self.bucket), CollectionLock(
                                   self.collections_path,
                                   self.collection_id,
                                   self.file_queue,
                                   harvest_timestamp=self.harvest_timestamp):
            # Write the collection config file to harvester
            self._write_harvest_collection_config(collection_config)

            # Start collecting
            twarc_thread = TwarcThread(collection_config,
                                       self.collections_path,
                                       self.harvest_timestamp, self.file_queue,
                                       self.changeset, self.stop_event,
                                       self.harvest_info, self.tweets_per_file)
            twarc_thread.start()

            # Wait for collection to stop
            twarc_thread.join()
            # Surface a failure recorded by the worker thread in this thread.
            if twarc_thread.exception:
                raise twarc_thread.exception

            # Save harvester info
            with FileQueueingWriter(
                    get_harvest_info_file(
                        self.collection_id,
                        self.harvest_timestamp,
                        collections_path=self.collections_path),
                    self.file_queue) as harvest_info_writer:
                harvest_info_writer.write_json(self.harvest_info.to_dict(),
                                               indent=2)
            if self.changeset.has_changes():
                # Sync again
                # Only the config file is re-downloaded here, via
                # sync_collection_config_file rather than the full
                # sync_collection_config used at startup.
                if self.bucket:
                    sync_collection_config_file(self.collections_path,
                                                self.collection_id,
                                                self.bucket)
                latest_collection_config = self._load_collection_config()
                # The fallback values differ (1 vs 2) on purpose: if either
                # config lacks a 'timestamp', the comparison reports a change
                # (assuming .get behaves like dict.get).
                if latest_collection_config.get('timestamp',
                                                1) != collection_config.get(
                                                    'timestamp', 2):
                    # If it has changed, then delete any updates from changeset for users that no longer exist.
                    log.debug('Cleaning changeset')
                    self.changeset.clean_changeset(latest_collection_config)
                # Merge changes into latest config
                latest_collection_config.merge_changeset(self.changeset)
                # Write config
                # NOTE: despite its name, this changeset_writer writes the
                # merged collection config, not the changeset.
                with FileQueueingWriter(
                        get_collection_config_filepath(
                            self.collection_id,
                            collections_path=self.collections_path),
                        self.file_queue) as changeset_writer:
                    changeset_writer.write_json(latest_collection_config,
                                                indent=2)

                # Write changeset
                # The changeset file path is derived from the changeset's own
                # 'change_timestamp' value, parsed into a datetime.
                change_timestamp = dateutil.parser.parse(
                    self.changeset['change_timestamp'])
                with FileQueueingWriter(
                        get_changeset_file(
                            self.collection_id,
                            change_timestamp,
                            collections_path=self.collections_path),
                        self.file_queue) as changeset_writer:
                    changeset_writer.write_json(self.changeset, indent=2)

        log.info('Harvesting stopped')
        # All done
        self.stopped_event.set()

        # Poll every half second until shutdown_event is set.
        log.debug('Waiting to shut down')
        while not self.shutdown_event.is_set():
            sleep(.5)
        log.info('Shut down')
Ejemplo n.º 8
0
 def _load_collection_config(self):
     """Read this collection's local config file into a CollectionConfig.

     Returns:
         A ``CollectionConfig`` built from the parsed JSON file.
     """
     config_path = get_collection_config_filepath(
         self.collection_id, collections_path=self.collections_path)
     with open(config_path) as config_file:
         parsed_config = json.load(config_file)
     return CollectionConfig(parsed_config)
Ejemplo n.º 9
0
def collection_exists(bucket, collection_id):
    """Report whether the collection's config file is present in the bucket.

    Args:
        bucket: Bucket to look in, passed through to ``file_exists``.
        collection_id: Identifier of the collection to check.

    Returns:
        The result of ``file_exists`` for the collection's config path.
    """
    config_key = get_collection_config_filepath(collection_id)
    return file_exists(bucket, config_key)