Code Example #1
class PostprocessPredictionsTask(luigi.Task):
    """Postprocess prediction from fairseq."""

    subset = luigi.Parameter()
    model = luigi.EnumParameter(enum=ModelType, description='Model type')
    result_path = luigi.Parameter('./outputs')

    def output(self) -> luigi.Target:
        return luigi.LocalTarget(
            Path(self.result_path) / self.model.name / self.subset / 'out.tsv')

    def requires(self) -> Dict[str, luigi.Task]:
        requirements = {
            'model':
            DownloadModelTask(model=self.model),
            'predictions':
            GeneratePredictionsTask(model=self.model, subset=self.subset),
        }
        return requirements

    @property
    def processor(self):
        # Cache the processor on first access; a single underscore avoids the
        # name mangling that would make the hasattr() check always fail.
        if not hasattr(self, '_processor'):
            self._processor = self.requires()['model'].get_processor()
        return self._processor

    def extract_predictions(self,
                            line: str,
                            property_names: Optional[List[str]] = None
                            ) -> Tuple[int, List[str]]:
        items = line.strip().split('\t')

        line_idx = int(items[0][2:])
        tokens = items[2].strip().split(' ')

        decoded_text = self.processor.decode_pieces(tokens)

        predictions = []
        for prediction in decoded_text.split('###'):
            if property_names:
                property_name = property_names[line_idx]
                prediction = f'{property_name}_:_{prediction.strip()}'

            if len(prediction.strip().replace(' ', '_').split('_:_')) == 2:
                name, value = prediction.strip().replace(' ',
                                                         '_').split('_:_',
                                                                    maxsplit=1)
                predictions.append(f'{name}={value}')
            else:
                if prediction:
                    predictions.append(prediction.strip().replace(' ', '_'))
        return line_idx, predictions

    def aggregate_by_article(
            self, predictions: Dict[int, List[str]]) -> Dict[int, List[str]]:
        preprocess_task = self.requires()['predictions'].requires(
        )['data'].requires()['prepare-data']
        with open(preprocess_task.output()['indices'].path) as index_file:
            indices = [int(line.strip()) for line in index_file]

        aggregated_predictions: DefaultDict[int, List[str]] = defaultdict(list)
        for line_idx, doc_idx in enumerate(indices):
            aggregated_predictions[doc_idx].extend(predictions[line_idx])
        return aggregated_predictions

    def remove_duplicates(self, predictions: DefaultDict[int, List[str]]):
        for idx in predictions:
            predictions[idx] = list(set(predictions[idx]))

    def read_property_names(self) -> List[str]:
        preprocess_task = self.requires()['predictions'].requires(
        )['data'].requires()['prepare-data']

        with open(
                preprocess_task.output()['property_names'].path) as name_file:
            names = [line.strip() for line in name_file]
        return names

    def run(self):
        property_names = self.read_property_names(
        ) if self.model == ModelType.T5 else None

        final_predictions = defaultdict(list)
        with open(self.input()['predictions'].path) as generated_file:
            for line in generated_file:
                if line.startswith('H-'):
                    line_idx, predictions = self.extract_predictions(
                        line, property_names)
                    final_predictions[line_idx] = predictions

        if self.model == ModelType.T5:
            final_predictions = self.aggregate_by_article(final_predictions)

        self.remove_duplicates(final_predictions)

        # if self.filter_extra_properties:
        property_names = []
        with gzip.open(
                Path('dataset') / 'wikireading-recycled' / self.subset /
                'in.tsv.gz', 'rt') as source_file:
            for line in source_file:
                property_names.append(line.strip().split('\t')[0].split(' '))

        for doc_idx in range(max(final_predictions.keys()) + 1):
            final_predictions[doc_idx] = [
                prediction for prediction in final_predictions[doc_idx]
                if prediction.split('=')[0] in property_names[doc_idx]
            ]

        # Plain open() does not create the output directory, unlike LocalTarget.open().
        Path(self.output().path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.output().path, 'wt') as out_file:
            for i in range(max(final_predictions.keys()) + 1):
                out_file.write(' '.join(sorted(final_predictions[i])) + '\n')
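Note: the ModelType values used above come from an Enum handed to luigi.EnumParameter. A minimal sketch, assuming the member names seen across these listings (the original enum definition and its values are not shown and are placeholders here):

import enum
import luigi

class ModelType(enum.Enum):
    # Member names inferred from the listings; the values are placeholders.
    T5 = 't5'
    DUAL_ROBERTA_TRANSFORMER = 'dual-source-roberta'
    DUAL_SOURCE_TRANSFORMER = 'dual-source-transformer'

# Hypothetical programmatic invocation; luigi also accepts the member name
# on the command line (e.g. --model T5).
if __name__ == '__main__':
    luigi.build(
        [PostprocessPredictionsTask(model=ModelType.T5, subset='test-B')],
        local_scheduler=True,
    )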
Code Example #2
class EvaluateModelTask(luigi.Task):
    model = luigi.EnumParameter(enum=ModelType)
    subset = luigi.ChoiceParameter(
        choices=[
            'all', 'unseen', 'rare', 'categorical', 'relational',
            'exact-match', 'long-articles'
        ],
        default='all',
        var_type=str,
    )
    split = luigi.ChoiceParameter(
        choices=['dev-0', 'test-A', 'test-B'],
        default='test-B',
        var_type=str,
    )

    def requires(self):
        if self.subset in ('exact-match', 'long-articles'):
            subset_to_generate = f'{self.split}-{self.subset}'
        else:
            subset_to_generate = self.split
        requirements = {
            'predictions':
            PostprocessPredictionsTask(model=self.model,
                                       subset=subset_to_generate),
            'dataset':
            DownloadDatasetTask(),
        }
        return requirements

    def output(self):
        if self.subset == 'all':
            full_subset_name = self.split
        else:
            full_subset_name = f'{self.split}-{self.subset}'

        return luigi.LocalTarget(
            Path('./results') / self.model.name / full_subset_name)

    def run(self):
        Path(self.output().path).parent.mkdir(parents=True, exist_ok=True)

        if self.subset == 'all':
            properties = None
            reference_file = Path(
                self.input()['dataset'].path) / self.split / 'expected.tsv'
        elif self.subset in ('exact-match', 'long-articles'):
            properties = None
            reference_file = Path(self.input(
            )['dataset'].path) / f'{self.split}-{self.subset}' / 'expected.tsv'
        else:
            properties = (Path(self.input()['dataset'].path) /
                          f'{self.split}-{self.subset}.properties').open()
            reference_file = Path(
                self.input()['dataset'].path) / self.split / 'expected.tsv'

        evaluate(
            prediction_file=Path(self.input()['predictions'].path).open(),
            reference_file=reference_file.open(),
            separator='=',
            output_file=Path(self.output().path).open('w'),
            metric='mean-F1',
            properties=properties,
            ignore_case=False,
        )
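Note: on the command line, EnumParameter values are parsed by member name and ChoiceParameter values stay plain strings, so --model T5 --subset rare --split test-B maps onto the parameters above. A small sketch of that parsing, assuming the ModelType enum from the previous note:

import luigi

# EnumParameter.parse looks the string up by member name.
assert luigi.EnumParameter(enum=ModelType).parse('T5') is ModelType.T5
# ChoiceParameter keeps the raw string (validated against choices on normalize).
assert luigi.ChoiceParameter(choices=['all', 'rare'], var_type=str).parse('rare') == 'rare'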
Code Example #3
class DownloadModelTask(luigi.Task):
    output_path = luigi.Parameter(
        './models',
        description=
        'the path where the dataset will be downloaded and extracted')
    model = luigi.EnumParameter(enum=ModelType, description='Model type')

    def output(self):
        out = {}
        if self.model == ModelType.T5:
            out['dict'] = luigi.LocalTarget(
                Path(self.output_path) / 't5' / 'dict.txt')
            out['model'] = luigi.LocalTarget(
                Path(self.output_path) / 't5' / 't5_best.pt')
            out['sentencepiece'] = luigi.LocalTarget(
                Path(self.output_path) / 't5' / 'sentencepiece.model')
        elif self.model == ModelType.DUAL_ROBERTA_TRANSFORMER:
            out['dict'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-roberta' / 'dict.txt')
            out['model'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-roberta' /
                'roberta_best.pt')
            out['vocab.bpe'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-roberta' / 'vocab.bpe')
            out['encoder.json'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-roberta' /
                'encoder.json')
        elif self.model == ModelType.DUAL_SOURCE_TRANSFORMER:
            out['dict'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-transformer' /
                'dict.txt')
            out['model'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-transformer' /
                'vanilla_best.pt')
            out['sentencepiece'] = luigi.LocalTarget(
                Path(self.output_path) / 'dual-source-transformer' /
                'spm.model')

        return out

    def requires(self):
        return None

    def run(self):
        logger.info(
            f'Downloading {self.model} model dataset to {self.output_path}')
        urls = {
            ModelType.DUAL_ROBERTA_TRANSFORMER:
            'https://applica-public.s3-eu-west-1.amazonaws.com/multi-property-extraction/fairseq-models/dual-source-roberta.tar.gz',
            ModelType.DUAL_SOURCE_TRANSFORMER:
            'https://applica-public.s3-eu-west-1.amazonaws.com/multi-property-extraction/fairseq-models/dual-source-transformer.tar.gz',
            ModelType.T5:
            'https://applica-public.s3-eu-west-1.amazonaws.com/multi-property-extraction/fairseq-models/t5.tar.gz',
        }
        response = requests.get(urls[self.model], stream=True)
        obj = io.BytesIO(response.content)
        tarfile.TarFile(mode='r',
                        fileobj=gzip.GzipFile(fileobj=obj,
                                              mode='rb')).extractall(
                                                  self.output_path)

    def get_processor(self):
        if self.model is ModelType.DUAL_SOURCE_TRANSFORMER:
            return SentencePieceProcessor(
                self.output()['sentencepiece'].path,
                tokens_to_end=['▁###'],
                tokens_to_ignore=[],
            )
        elif self.model is ModelType.DUAL_ROBERTA_TRANSFORMER:
            return RobertaProcessor(self.output()['encoder.json'].path,
                                    self.output()['vocab.bpe'].path)
        elif self.model is ModelType.T5:
            return SentencePieceProcessor(
                path=self.output()['sentencepiece'].path,
                tokens_to_end=['▁#'],
                tokens_to_ignore=['##'],
            )
        else:
            raise Exception(f'Unsupported model type: "{self.model}".')
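Note: run() above buffers the entire archive in memory via response.content before extracting. A hedged alternative sketch that streams the tarball straight off the HTTP response instead (same URLs; assumes the server returns a plain gzipped tar):

import tarfile
import requests

def download_and_extract(url: str, output_path: str) -> None:
    # Stream the response body instead of loading it into memory.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        # 'r|gz' reads a gzip-compressed tar from a non-seekable stream.
        with tarfile.open(fileobj=response.raw, mode='r|gz') as archive:
            archive.extractall(output_path)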
Code Example #4
class SynapsePipelineTask(luigi.Task):
    '''The synapse pipeline task processes the synapses in a volume'''
    
    task_namespace = "ariadne_microns_pipeline"
    
    volume = VolumeParameter(description="The volume to process")
    output_location = luigi.Parameter(
        description="Directory for the segmentation .h5 file")
    temp_dirs = luigi.ListParameter(
        description="Locations for temp files")
    classifier_location = luigi.Parameter(
        description="Location for the classifier pickle file")
    experiment = luigi.Parameter(
        description="The Butterfly experiment that produced the dataset")
    sample = luigi.Parameter(
        description="The ID of the biological sample that was imaged")
    dataset = luigi.Parameter(
        description="The name of the volume that was imaged")
    channel = luigi.Parameter(
        default="raw",
        description="The name of the channel from which we take data")
    #
    # Optional parameters
    #
    butterfly_url = luigi.Parameter(
        default="http://localhost:2001/api",
        description="The URL for the butterfly server")
    block_width = luigi.IntParameter(
        default=2048,
        description="The width of a block")
    block_height = luigi.IntParameter(
        default=2048,
        description="The height of a block")
    block_depth = luigi.IntParameter(
        default=100,
        description="The depth of a block")
    xy_overlap = luigi.IntParameter(
        default=50,
        description="The amount of overlap between blocks in the x and y "
        "directions.")
    z_overlap = luigi.IntParameter(
        default=10,
        description="The amount of overlap between blocks in the z direction.")
    synapse_class_name = luigi.Parameter(
        default="synapse",
        description="The name of the synapse class in the classifier")
    #
    # FindSynapsesTask parameters
    #
    synapse_xy_erosion = luigi.IntParameter(
        default=4,
        description = "# of pixels to erode the neuron segmentation in the "
                      "X and Y direction prior to synapse segmentation.")
    synapse_z_erosion = luigi.IntParameter(
        default=1,
        description = "# of pixels to erode the neuron segmentation in the "
                      "Z direction prior to synapse segmentation.")
    synapse_xy_sigma = luigi.FloatParameter(
        description="Sigma for smoothing Gaussian for synapse segmentation "
                     "in the x and y directions.",
        default=1)
    synapse_z_sigma = luigi.FloatParameter(
        description="Sigma for smoothing Gaussian for symapse segmentation "
                     "in the z direction.",
        default=.5)
    synapse_min_size_2d = luigi.IntParameter(
        default=100,
        description="Remove isolated synapse foreground in a plane if "
        "less than this # of pixels")
    synapse_max_size_2d = luigi.IntParameter(
        default=15000,
        description = "Remove large patches of mislabeled synapse in a plane "
        "that have an area greater than this")
    synapse_min_size_3d = luigi.IntParameter(
        default=500,
        description = "Minimum size in voxels of a synapse")
    min_synapse_depth = luigi.IntParameter(
        default=5,
        description="Minimum acceptable size of a synapse in the Z direction")
    synapse_threshold = luigi.FloatParameter(
        description="Threshold for synapse voxels vs background voxels",
        default=128.)
    #
    # connected components parameters
    #
    joining_method = luigi.EnumParameter(
        enum=JoiningMethod,
        default=JoiningMethod.PAIRWISE_MULTIMATCH,
        description="Algorithm to use to join neuroproofed segmentation blocks")
    min_percent_connected = luigi.FloatParameter(
        default=75.0,
        description="Minimum overlap required to join segments across blocks")
    min_overlap_volume = luigi.IntParameter(
        default=1000,
        description="The minimum # of voxels of overlap between two objects "
                    "required to join them across blocks")
    max_poly_matches = luigi.IntParameter(
        default=1)
    dont_join_orphans = luigi.BoolParameter()
    orphan_min_overlap_ratio = luigi.FloatParameter(
        default=0.9)
    orphan_min_overlap_volume = luigi.IntParameter(
        default=1000,
        description="The minimum # of voxels of overlap needed to join "
                    "an orphan segment")
    halo_size_xy = luigi.IntParameter(
        default=5,
        description="The number of pixels on either side of the origin to "
                    "use as context when extracting the slice to be joined, "
                    "joining slices in the x and y directions")
    halo_size_z = luigi.IntParameter(
        default=1,
        description="The number of pixels on either side of the origin to "
                    "use as context when extracting the slice to be joined, "
                    "joining slices in the z direction")
    
    
    def output(self):
        return luigi.LocalTarget(self.output_location+".done")
    
    def run(self):
        with self.output().open("w") as fd:
            fd.write("Done")
    
    def get_dirs(self, x, y, z):
        '''Return a directory suited for storing a file with the given offset
        
        Create a hierarchy of directories in order to limit the number
        of files in any one directory.
        '''
        return [os.path.join(temp_dir,
                             self.experiment,
                             self.sample,
                             self.dataset,
                             self.channel,
                             str(x),
                             str(y),
                             str(z)) for temp_dir in self.temp_dirs]
    
    def get_pattern(self, dataset_name):
        return "{x:09d}_{y:09d}_{z:09d}_"+dataset_name
    
    def get_dataset_location(self, volume, dataset_name):
        return DatasetLocation(self.get_dirs(volume.x, volume.y, volume.z),
                               dataset_name,
                               self.get_pattern(dataset_name))
     
    def requires(self):
        self.compute_requirements()
        return self.requirements
    
    def compute_requirements(self):
        if hasattr(self, "requirements"):
            return
        try:
            rh_logger.logger.report_event("Assembling pipeline")
        except:
            rh_logger.logger.start_process("Ariadne pipeline",
                                           "Assembling pipeline")
            #
            # Configuration turns off the luigi-interface logger
            #
        import logging
        logging.getLogger("luigi-interface").disabled = False
        
        self.task_factory = AMTaskFactory()
        rh_logger.logger.report_event(
            "Loading classifier from %s" % self.classifier_location)
        self.pixel_classifier = PixelClassifierTarget(self.classifier_location)
        self.compute_coordinates()
        self.compute_block_requirements()
        self.compute_stitching_requirements()
        
    def compute_coordinates(self):
        '''Compute the coordinates of the blocks'''
        self.n_x = int(np.ceil(float(self.volume.width) / self.block_width))
        self.n_y = int(np.ceil(float(self.volume.height) / self.block_height))
        self.n_z = int(np.ceil(float(self.volume.depth) / self.block_depth))
        x = np.linspace(self.volume.x, self.volume.x1, self.n_x+1).astype(int)
        self.xs = x[:-1]
        self.xe = x[1:]
        y = np.linspace(self.volume.y, self.volume.y1, self.n_y+1).astype(int)
        self.ys = y[:-1]
        self.ye = y[1:]
        z = np.linspace(self.volume.z, self.volume.z1, self.n_z+1).astype(int)
        self.zs = z[:-1]
        self.ze = z[1:]
    
    def compute_block_requirements(self):
        self.segmentation_tasks = \
            np.zeros((self.n_z, self.n_y, self.n_x), object)
        for zi in range(self.n_z):
            for yi in range(self.n_y):
                for xi in range(self.n_x):
                    self.segmentation_tasks[zi, yi, xi] = \
                        self.compute_block_requirement(xi, yi, zi)
    
    def compute_block_requirement(self, xi, yi, zi):
        x0 = self.xs[xi]
        x1 = self.xe[xi]
        y0 = self.ys[yi]
        y1 = self.ye[yi]
        z0 = self.zs[zi]
        z1 = self.ze[zi]
        
        # Account for overlap
        
        if x0 != self.volume.x:
            x0 -= self.xy_overlap
        if x1 != self.volume.x1:
            x1 += self.xy_overlap
        if y0 != self.volume.y:
            y0 -= self.xy_overlap
        if y1 != self.volume.y1:
            y1 += self.xy_overlap
        if z0 != self.volume.z:
            z0 -= self.z_overlap
        if z1 != self.volume.z1:
            z1 += self.z_overlap
        
        volume = Volume(x0, y0, z0, x1 - x0, y1 - y0, z1 - z0)
        
        #
        # Get the classifier input block coordinates
        #
        classifier_xpad = self.pixel_classifier.classifier.get_x_pad()
        classifier_ypad = self.pixel_classifier.classifier.get_y_pad()
        classifier_zpad = self.pixel_classifier.classifier.get_z_pad()
        
        cx0 = x0 - classifier_xpad
        cx1 = x1 + classifier_xpad
        cy0 = y0 - classifier_ypad
        cy1 = y1 + classifier_ypad
        cz0 = z0 - classifier_zpad
        cz1 = z1 + classifier_zpad
        
        classifier_input_volume = Volume(
            cx0, cy0, cz0, cx1 - cx0, cy1 - cy0, cz1 - cz0)
        #
        # The dataset locations
        #
        dl_butterfly = self.get_dataset_location(classifier_input_volume,
                                                 "image")
        dl_synapse = self.get_dataset_location(volume, "synapse-prediction")
        dl_segmentation = self.get_dataset_location(
            volume, "synapse-segmentation")
        #
        # Pipeline flow is Butterfly -> classifier -> shim -> find synapses
        #
        
        btask = self.task_factory.gen_get_volume_task(
            experiment=self.experiment,
            sample=self.sample,
            dataset=self.dataset,
            channel=self.channel,
            url=self.butterfly_url,
            volume=classifier_input_volume,
            location=dl_butterfly)
        
        paths = self.get_dirs(x0, y0, z0)
        ctask = self.task_factory.gen_classify_task(
            paths=paths, 
            datasets={self.synapse_class_name:"synapse-prediction"}, 
            pattern=self.get_pattern("synapse-prediction"),
            img_volume=btask.volume,
            img_location=btask.output().dataset_location,
            classifier_path=self.classifier_location)
        ctask.set_requirement(btask)
        
        shim_task = ClassifyShimTask.make_shim(
            classify_task=ctask,
            dataset_name="synapse-prediction")
        
        find_synapses_task = self.task_factory.gen_find_synapses_task(
            volume=volume,
            syn_location=shim_task.output().dataset_location,
            neuron_segmentation=EMPTY_DATASET_LOCATION,
            erosion_xy=self.synapse_xy_erosion,
            erosion_z=self.synapse_z_erosion,
            sigma_xy=self.synapse_xy_sigma,
            sigma_z=self.synapse_z_sigma,
            threshold=self.synapse_threshold,
            min_size_2d=self.synapse_min_size_2d,
            max_size_2d=self.synapse_max_size_2d,
            min_size_3d=self.synapse_min_size_3d,
            min_slice=self.min_synapse_depth,
            output_location=dl_segmentation)
        find_synapses_task.set_requirement(shim_task)
        return find_synapses_task
    
    def compute_stitching_requirements(self):
        '''Compute the tasks needed to stitch the blocks'''
        #
        # Pipeline is 
        # block -> 
        #      x-connections / y-connections / z-connections ->
        # all-connected-components ->
        # stitch segmentation
        #
        cc_tasks = []
        #
        # The x-blocks
        #
        for xi in range(self.n_x-1):
            for yi in range(self.n_y):
                for zi in range(self.n_z):
                    cc_tasks.append(
                        self.compute_x_connected_components_task(xi, yi, zi))
        #
        # The y-blocks
        #
        for yi in range(self.n_y-1):
            for xi in range(self.n_x):
                for zi in range(self.n_z):
                    cc_tasks.append(
                        self.compute_y_connected_components_task(xi, yi, zi))
        #
        # The z-blocks
        #
        for zi in range(self.n_z-1):
            for xi in range(self.n_x):
                for yi in range(self.n_y):
                    cc_tasks.append(
                        self.compute_z_connected_components_task(xi, yi, zi))
        #
        # The all-connected-components task
        #
        acc_location = os.path.join(
            self.get_dirs(self.xs[0], self.ys[0], self.zs[0])[0],
            "connectivity-graph.json")
        if len(cc_tasks) > 0:
            acc_task = self.task_factory.gen_all_connected_components_task(
                [_.output().path for _ in cc_tasks],
                acc_location)
            for task in cc_tasks:
                acc_task.set_requirement(task)
        else:
            # only one block - do a fake connected components
            seg_tgt = self.segmentation_tasks[0, 0, 0].output()
            acc_task = FakeAllConnectedComponentsTask(
                volume=seg_tgt.volume,
                location=seg_tgt.dataset_location,
                output_location=acc_location)
        for task in self.segmentation_tasks.flatten():
            acc_task.set_requirement(task)
        #
        # The stitching task
        #
        output_location = DatasetLocation(
            [self.output_location], 
            "synapse_segmentation",
            self.get_pattern("synapse_segmentation"))
        stask = self.task_factory.gen_stitch_segmentation_task(
             [], acc_task.output().path, self.volume, output_location)
        stask.x_padding = self.xy_overlap / 2
        stask.y_padding = self.xy_overlap / 2
        stask.z_padding = self.z_overlap / 2
        stask.set_requirement(acc_task)
        self.requirements = [stask]
    
    def configure_connected_components_task(self, task):
        task.joining_method = self.joining_method
        task.min_overlap_percent = self.min_percent_connected
        task.min_overlap_volume = self.min_overlap_volume
        task.max_poly_matches = self.max_poly_matches
        task.dont_join_orphans = self.dont_join_orphans
        task.orphan_min_overlap_ratio = self.orphan_min_overlap_ratio
        task.orphan_min_overlap_volume = self.orphan_min_overlap_volume
        
    def compute_x_connected_components_task(self, xi, yi, zi):
        task1 = self.segmentation_tasks[zi, yi, xi]
        tgt1 = task1.output()
        task2 = self.segmentation_tasks[zi, yi, xi+1]
        tgt2 = task2.output()
        y0 = max(tgt1.volume.y, tgt2.volume.y)
        y1 = min(tgt1.volume.y1, tgt2.volume.y1)
        z0 = max(tgt1.volume.z, tgt2.volume.z)
        z1 = min(tgt1.volume.z1, tgt2.volume.z1)
        overlap_volume = Volume(
            (tgt1.volume.x1 + tgt2.volume.x) / 2 - self.halo_size_xy / 2,
            y0, z0,
            self.halo_size_xy, y1-y0, z1-z0)
        
        output_location = os.path.join(
            self.get_dirs(tgt1.x, tgt1.y, tgt1.z)[0],
            "connected-components-x.json")
        cctask = self.task_factory.gen_connected_components_task(
            volume1=tgt1.volume,
            location1=tgt1.dataset_location,
            volume2=tgt2.volume,
            location2=tgt2.dataset_location,
            overlap_volume=overlap_volume,
            output_location=output_location)
        self.configure_connected_components_task(cctask)
        cctask.set_requirement(task1)
        cctask.set_requirement(task2)
        return cctask

    def compute_y_connected_components_task(self, xi, yi, zi):
        task1 = self.segmentation_tasks[zi, yi, xi]
        tgt1 = task1.output()
        task2 = self.segmentation_tasks[zi, yi+1, xi]
        tgt2 = task2.output()
        x0 = max(tgt1.volume.x, tgt2.volume.x)
        x1 = min(tgt1.volume.x1, tgt2.volume.x1)
        z0 = max(tgt1.volume.z, tgt2.volume.z)
        z1 = min(tgt1.volume.z1, tgt2.volume.z1)
        overlap_volume = Volume(
            x0, (tgt1.volume.y1 + tgt2.volume.y) / 2 - self.halo_size_xy / 2, z0,
            x1 - x0, self.halo_size_xy, z1-z0)
        
        output_location = os.path.join(
            self.get_dirs(tgt1.x, tgt1.y, tgt1.z)[0],
            "connected-components-y.json")
        cctask = self.task_factory.gen_connected_components_task(
            volume1=tgt1.volume,
            location1=tgt1.dataset_location,
            volume2=tgt2.volume,
            location2=tgt2.dataset_location,
            overlap_volume=overlap_volume,
            output_location=output_location)
        self.configure_connected_components_task(cctask)
        cctask.set_requirement(task1)
        cctask.set_requirement(task2)
        return cctask

    def compute_z_connected_components_task(self, xi, yi, zi):
        task1 = self.segmentation_tasks[zi, yi, xi]
        tgt1 = task1.output()
        task2 = self.segmentation_tasks[zi+1, yi, xi]
        tgt2 = task2.output()
        x0 = max(tgt1.volume.x, tgt2.volume.x)
        x1 = min(tgt1.volume.x1, tgt2.volume.x1)
        y0 = max(tgt1.volume.y, tgt2.volume.y)
        y1 = min(tgt1.volume.y1, tgt2.volume.y1)
        overlap_volume = Volume(
            x0, y0, (tgt1.volume.z1 + tgt2.volume.z) / 2 - self.halo_size_z / 2,
            x1 - x0, y1 - y0, self.halo_size_z)
        
        output_location = os.path.join(
            self.get_dirs(tgt1.x, tgt1.y, tgt1.z)[0],
            "connected-components-z.json")
        cctask = self.task_factory.gen_connected_components_task(
            volume1=tgt1.volume,
            location1=tgt1.dataset_location,
            volume2=tgt2.volume,
            location2=tgt2.dataset_location,
            overlap_volume=overlap_volume,
            output_location=output_location)
        self.configure_connected_components_task(cctask)
        cctask.set_requirement(task1)
        cctask.set_requirement(task2)
        return cctask
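Note: compute_coordinates above splits each axis into blocks by asking np.linspace for n+1 evenly spaced edges and pairing consecutive edges as block start/end coordinates. A standalone sketch of that arithmetic:

import numpy as np

def block_edges(start, end, block_size):
    """Split [start, end) into n = ceil(extent / block_size) roughly equal blocks."""
    n_blocks = int(np.ceil(float(end - start) / block_size))
    edges = np.linspace(start, end, n_blocks + 1).astype(int)
    return edges[:-1], edges[1:]  # block starts, block ends

# Example: a 5000-pixel extent with 2048-wide blocks gives three blocks:
# starts == [0, 1666, 3333], ends == [1666, 3333, 5000]
starts, ends = block_edges(0, 5000, 2048)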
Code Example #5
class Bar(RunOnceTask):
    eparam = luigi.EnumParameter(enum=Color)
Code Example #6
class Baz(RunOnceTask):
    eparam = luigi.EnumParameter(enum=Color)
    another_param = luigi.IntParameter()
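Note: the Color enum behind these two snippets is not shown. A minimal sketch with hypothetical member names, illustrating that the enum value participates in the task's identity:

import enum
import luigi

class Color(enum.Enum):
    # Hypothetical members; the original definition is not part of the listing.
    RED = 1
    GREEN = 2

# Tasks that differ only in the enum member are distinct tasks.
assert Bar(eparam=Color.RED).task_id != Bar(eparam=Color.GREEN).task_id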
Code Example #7
File: base.py  Project: leifdenby/genesis
class PerformObjectTracking2D(luigi.Task):
    base_name = luigi.Parameter()
    tracking_type = luigi.EnumParameter(enum=TrackingType)
    timestep_interval = luigi.ListParameter(default=[])
    U_offset = luigi.ListParameter(default=[])
    run_in_temp_dir = luigi.BoolParameter(default=True)

    def requires(self):
        if REGEX_INSTANTENOUS_BASENAME.match(self.base_name):
            raise Exception("Shouldn't pass base_name with timestep suffix"
                            " (`.tn`) to tracking util")

        required_vars = uclales_2d_tracking.get_required_vars(
            tracking_type=self.tracking_type)

        return [
            TimeCrossSectionSlices2D(base_name=self.base_name,
                                     var_name=var_name)
            for var_name in required_vars
        ]

    def run(self):
        meta = _get_dataset_meta_info(self.base_name)

        if len(self.timestep_interval) == 0:
            tn_start = 0
            N_timesteps = {
                input.fn: int(input.open().time.count())
                for input in self.input()
            }
            if len(set(N_timesteps.values())) == 1:
                tn_end = list(N_timesteps.values())[0] - 1
            else:
                s_files = "\n\t".join([
                    "{fn}: {N}".format(fn=k, N=v)
                    for (k, v) in N_timesteps.items()
                ])
                raise Exception(
                    "The input files required for tracking don't currently have"
                    " the same number of timesteps, maybe some of them need"
                    " recreating? Required files and number of timesteps:\n"
                    f"\n\t{s_files}")
        else:
            tn_start, tn_end = self.timestep_interval

        if tn_start != 0:
            warnings.warn("There is currently a bug in the cloud-tracking "
                          "code which causes it to crash when not starting "
                          "at time index 0 (fortran index 1). Setting "
                          "tn_start=0")
            tn_start = 0

        if meta.get("no_tracking_calls", False):
            filename = Path(self.output().fn).name
            p_source = Path(meta["path"])
            p_source_tracking = p_source / "tracking_output" / filename

            if p_source_tracking.exists():
                Path(self.output().fn).parent.mkdir(parents=True,
                                                    exist_ok=True)
                os.symlink(p_source_tracking, self.output().fn)
            else:
                raise Exception(
                    "Automatic tracking calls have been disabled and"
                    f" couldn't find tracking output."
                    " Please run tracking utility externally and place output"
                    f" in `{p_source_tracking}`")

        else:
            dataset_name = meta["experiment_name"]

            if self.run_in_temp_dir:
                tempdir = tempfile.TemporaryDirectory()
                p_data = Path(tempdir.name)
                # symlink the source data files to the temporary directory
                for input in self.input():
                    os.symlink(
                        Path(input.fn).absolute(),
                        p_data / Path(input.fn).name)
                fn_track = f"{dataset_name}.out.xy.track.nc"
                # and the file for the tracking tool to write to
                Path(self.output().fn).parent.mkdir(exist_ok=True,
                                                    parents=True)
                os.symlink(
                    Path(self.output().fn).absolute(), p_data / fn_track)
            else:
                p_data = Path(self.input()[0].fn).parent

            fn_tracking = uclales_2d_tracking.call(
                data_path=p_data,
                dataset_name=dataset_name,
                tn_start=tn_start + 1,
                tn_end=tn_end,
                tracking_type=self.tracking_type,
                U_offset=self.U_offset,
            )

            if not self.run_in_temp_dir:
                Path(self.output().fn).parent.mkdir(exist_ok=True,
                                                    parents=True)
                shutil.move(fn_tracking, self.output().fn)

    def output(self):
        type_id = uclales_2d_tracking.TrackingType.make_identifier(
            self.tracking_type)

        if len(self.timestep_interval) == 0:
            interval_id = "__all__"
        else:
            tn_start, tn_end = self.timestep_interval
            interval_id = "{}__{}".format(tn_start, tn_end)

        if self.U_offset:
            offset_s = "u{}_v{}_offset".format(*self.U_offset)
        else:
            offset_s = "no_offset"

        FN_2D_FORMAT = ("{experiment_name}.tracking.{type_id}"
                        ".{interval_id}.{offset}.nc")

        meta = _get_dataset_meta_info(self.base_name)
        experiment_name = meta["experiment_name"]
        fn = FN_2D_FORMAT.format(
            experiment_name=experiment_name,
            type_id=type_id,
            interval_id=interval_id,
            offset=offset_s,
        )

        p = get_workdir() / self.base_name / "tracking_output" / fn
        return XArrayTargetUCLALESTracking(str(p))
Code Example #8
File: etl_tasks.py  Project: dckc/grouse
class SqlScriptTask(DBAccessTask):
    '''Task to run a stylized SQL script.

    As seen in `script_lib`, a script may require (in the luigi sense)
    other scripts and it  is complete iff its last query says so.

    Running a script may be parameterized with bind params and/or
    Oracle sqlplus style defined `&&variables`:

    >>> variables = dict(I2B2STAR='I2B2DEMODATA', CMS_RIF='CMS',
    ...                  cms_source_cd='X', bene_id_source='b')
    >>> txform = SqlScriptTask(
    ...    account='sqlite:///', passkey=None,
    ...    script=Script.cms_patient_mapping,
    ...    param_vars=variables)

    >>> [task.script for task in txform.requires()]
    ... #doctest: +ELLIPSIS
    [<Package(cms_keys)>]

    >>> txform.complete()
    False

    '''
    script = cast(Script, luigi.EnumParameter(enum=Script))
    param_vars = cast(Environment, luigi.DictParameter(default={}))
    _log = logging.getLogger('sql_scripts')  # ISSUE: ambient. magic-string

    @property
    def variables(self) -> Environment:
        '''Defined variables for this task (or task family).
        '''
        return self.param_vars

    @property
    def vars_for_deps(self) -> Environment:
        '''Defined variables to supply to dependencies.
        '''
        return self.variables

    def requires(self) -> List[luigi.Task]:
        '''Wrap each of `self.script.deps()` in a SqlScriptTask.
        '''
        return [
            SqlScriptTask(script=s,
                          param_vars=self.vars_for_deps,
                          account=self.account,
                          passkey=self.passkey,
                          echo=self.echo) for s in self.script.deps()
        ]

    def log_info(self) -> Dict[str, Any]:
        '''Include script, filename in self.log_info().
        '''
        return dict(DBAccessTask.log_info(self),
                    script=self.script.name,
                    filename=self.script.fname)

    def complete(self) -> bool:
        '''Each script's last query tells whether it is complete.

        It should be a scalar query that returns non-zero for done
        and either zero or an error for not done.
        '''
        last_query = self.last_query()
        params = params_used(self.complete_params(), last_query)
        with self.connection(event=self.task_family + ' complete query: ' +
                             self.script.name) as conn:
            try:
                result = conn.scalar(sql_text(last_query), params)
                return bool(result)
            except DatabaseError as exc:
                conn.log.warning('%(event)s: %(exc)s',
                                 dict(event='complete query error', exc=exc))
                return False

    def last_query(self) -> SQL:
        """
        Note: In order to support run-only variables as in UploadTask,
              we skip statements with unbound &&variables.
        """
        return self.script.statements(skip_unbound=True,
                                      variables=self.variables)[-1]

    def complete_params(self) -> Dict[str, Any]:
        '''Make `task_id` available to complete query as a bind param.
        '''
        return dict(task_id=self.task_id)

    def run(self) -> None:
        '''Run each statement in the script without any bind parameters.
        '''
        self.run_bound()

    def run_bound(self, script_params: Opt[Params] = None) -> None:
        '''Run with a (default empty) set of parameters bound.
        '''
        with self.connection(event='run script') as conn:
            self.run_event(conn, script_params=script_params)

    def run_event(self,
                  conn: LoggedConnection,
                  run_vars: Opt[Environment] = None,
                  script_params: Opt[Params] = None) -> int:
        '''Run script inside a LoggedConnection event.

        @param run_vars: variables to define for this run
        @param script_params: parameters to bind for this run
        @return: count of rows bulk-inserted
                 always 0 for this class, but see UploadTask

        To see how a script can ignore errors, see :mod:`script_lib`.
        '''
        bulk_rows = 0
        ignore_error = False
        run_params = dict(script_params or {}, task_id=self.task_id)
        fname = self.script.fname
        variables = dict(run_vars or {}, **self.variables)
        each_statement = self.script.each_statement(variables=variables)

        for line, _comment, statement in each_statement:
            try:
                if self.is_bulk(statement):
                    bulk_rows = self.bulk_insert(conn, fname, line, statement,
                                                 run_params, bulk_rows)
                else:
                    ignore_error = self.execute_statement(
                        conn, fname, line, statement, run_params, ignore_error)
            except DatabaseError as exc:
                db = self._dbtarget().engine
                err = SqlScriptError(exc, self.script, line, statement,
                                     str(db))
                if ignore_error:
                    conn.log.warning('%(event)s: %(error)s',
                                     dict(event='ignore', error=err))
                else:
                    raise err from None
        if bulk_rows > 0:
            conn.step.msg_parts.append(' %(rowtotal)s total rows')
            conn.step.argobj.update(dict(rowtotal=bulk_rows))

        return self.loaded_record(conn, bulk_rows)

    def loaded_record(self, conn: LoggedConnection, bulk_rows: int) -> int:
        return bulk_rows

    def execute_statement(self, conn: LoggedConnection, fname: str, line: int,
                          statement: SQL, run_params: Params,
                          ignore_error: bool) -> bool:
        '''Log and execute one statement.
        '''
        sqlerror = Script.sqlerror(statement)
        if sqlerror is not None:
            return sqlerror
        params = params_used(run_params, statement)
        self.set_status_message('%s:%s:\n%s\n%s' %
                                (fname, line, statement, params))
        conn.execute(statement, params)
        return ignore_error

    def is_bulk(self, statement: SQL) -> bool:
        '''always False for this class, but see UploadTask
        '''
        return False

    def bulk_insert(self, conn: LoggedConnection, fname: str, line: int,
                    statement: SQL, run_params: Params, bulk_rows: int) -> int:
        raise NotImplementedError(
            'overriding is_bulk() requires overriding bulk_insert()')
Code Example #9
class NeuroproofRunMixin:
    neuroproof = luigi.Parameter(
        description="Location of the neuroproof_graph_predict binary")
    neuroproof_ld_library_path = luigi.Parameter(
        description="Library paths to Neuroproof's shared libraries. "
        "This should include paths to CILK stdc++ libraries, Vigra libraries, "
        "JSONCPP libraries, and OpenCV libraries.")
    classifier_filename = luigi.Parameter(
        description="The Vigra random forest classifier or OpenCV random "
        "forest agglomeration classifier. In addition, there may be a file "
        "with the given filename with \"_ignore.txt\" appended which gives "
        "the indices of the features to ignore and similarly a file with "
        "\"_config.json\" appended which gives configuration information to "
        "neuroproof.")
    threshold = luigi.FloatParameter(
        default=0.2, description="Segmentation threshold for neuroproof")
    watershed_threshold = luigi.IntParameter(
        default=0,
        description="Threshold used for removing small bodies as a "
        "post-processing step")
    neuroproof_version = luigi.EnumParameter(
        enum=NeuroproofVersion,
        default=NeuroproofVersion.MIT,
        description="The command-line convention to be used to run the "
        "Neuroproof binary")

    def ariadne_run(self):
        '''Run the neuroproof subprocess'''
        if self.neuroproof_version == NeuroproofVersion.MINIMAL:
            self.run_standard()
        elif self.neuroproof_version == NeuroproofVersion.FLY_EM:
            self.run_optimized_with_copy()
        elif self.neuroproof_version == NeuroproofVersion.FAST:
            self.run_fast()
        else:
            self.run_optimized()

    def run_standard(self):
        '''Run the out-of-the-box neuroproof'''
        #
        # Write the segmentation and membrane probabilities to one
        # big temporary hdf5 file
        #
        prob_volume = DestVolumeReader(self.prob_loading_plan_path)
        seg_volume = DestVolumeReader(self.input_seg_loading_plan_path)
        additional_maps = \
            [DestVolumeReader(_) for _ in self.additional_loading_plan_paths]
        h5file = tempfile.mktemp(".h5")
        probfile = tempfile.mktemp(".h5")
        rh_logger.logger.report_event("Neuroproof watershed: %s" % h5file)
        rh_logger.logger.report_event("Neuroproof probabilities: %s" %
                                      probfile)
        pool = multiprocessing.Pool(2)
        seg_result = pool.apply_async(write_seg_volume,
                                      args=(h5file, seg_volume,
                                            "segmentation"))
        duplicate = None if len(additional_maps) > 0 else False
        prob_result = pool.apply_async(write_prob_volume,
                                       args=(prob_volume, additional_maps,
                                             probfile, "probabilities", False,
                                             duplicate))
        pool.close()
        pool.join()
        seg_result.get()
        prob_result.get()
        outfile = tempfile.mktemp(".h5")
        rh_logger.logger.report_event("Neuroproof output: %s" % outfile)

        try:
            args = [
                self.neuroproof, "-threshold",
                str(self.threshold), "-algorithm", "1", "-nomito",
                "-min_region_sz", "0", "-watershed", h5file, "segmentation",
                "-prediction", probfile, "probabilities", "-output", outfile,
                "segmentation", "-classifier", self.classifier_filename
            ]
            rh_logger.logger.report_event(" ".join(args))

            #
            # Inject the custom LD_LIBRARY_PATH into the subprocess environment
            #
            env = os.environ.copy()
            if "LD_LIBRARY_PATH" in env:
                ld_library_path = self.neuroproof_ld_library_path + os.pathsep +\
                    env["LD_LIBRARY_PATH"]
            else:
                ld_library_path = self.neuroproof_ld_library_path
            env["LD_LIBRARY_PATH"] = ld_library_path
            self.configure_env(env)
            #
            # Do the dirty deed...
            #
            subprocess.check_call(args, env=env, close_fds=True)
            #
            # There's an apparent bug in Neuroproof where it writes
            # the output to "fo.h5" for example, when you've asked it
            # to send the output to "foo.h5"
            #
            alt_outfile = os.path.splitext(outfile)[0][:-1] + ".h5"
            if (not os.path.exists(outfile)) and os.path.exists(alt_outfile):
                outfile = alt_outfile
            #
            # Finish the output volume
            #
            with h5py.File(outfile, "r") as fd:
                self.output().imwrite(fd["segmentation"][:].astype(np.uint32))
        finally:
            os.remove(h5file)
            os.remove(probfile)
            if os.path.exists(outfile):
                os.remove(outfile)

    def run_optimized_with_copy(self):
        '''Run the MIT neuroproof, but copying everything'''
        inputs = self.input()
        prob_volume = inputs.next()
        seg_volume = inputs.next()
        additional_maps = list(inputs)
        h5file = tempfile.mktemp(".h5")
        probfile = tempfile.mktemp(".h5")
        rh_logger.logger.report_event("Neuroproof watershed: %s" % h5file)
        rh_logger.logger.report_event("Neuroproof probabilities: %s" %
                                      probfile)
        pool = multiprocessing.Pool(2)
        seg_result = pool.apply_async(write_seg_volume,
                                      args=(h5file, seg_volume, "stack"))
        prob_result = pool.apply_async(write_prob_volume,
                                       args=(prob_volume, additional_maps,
                                             probfile, "volume/predictions"))
        pool.close()
        pool.join()
        seg_result.get()
        prob_result.get()
        outfile = tempfile.mktemp(".h5")
        rh_logger.logger.report_event("Neuroproof output: %s" % outfile)
        try:
            args = [
                self.neuroproof, h5file, probfile, self.classifier_filename,
                "--output-file", outfile, "--threshold",
                str(self.threshold), "--watershed-threshold",
                str(self.watershed_threshold)
            ]
            rh_logger.logger.report_event(" ".join(args))

            #
            # Inject the custom LD_LIBRARY_PATH into the subprocess environment
            #
            env = os.environ.copy()
            if "LD_LIBRARY_PATH" in env:
                ld_library_path = self.neuroproof_ld_library_path + os.pathsep +\
                    env["LD_LIBRARY_PATH"]
            else:
                ld_library_path = self.neuroproof_ld_library_path
            env["LD_LIBRARY_PATH"] = ld_library_path
            self.configure_env(env)
            #
            # Do the dirty deed...
            #
            subprocess.check_call(args, env=env, close_fds=True)
            #
            # There's an apparent bug in Neuroproof where it writes
            # the output to "fo.h5" for example, when you've asked it
            # to send the output to "foo.h5"
            #
            alt_outfile = os.path.splitext(outfile)[0][:-1] + ".h5"
            if (not os.path.exists(outfile)) and os.path.exists(alt_outfile):
                outfile = alt_outfile
            #
            # Finish the output volume
            #
            with h5py.File(outfile, "r") as fd:
                self.output().imwrite(fd["stack"][:].astype(np.uint32))
        finally:
            os.remove(h5file)
            os.remove(probfile)
            if os.path.exists(outfile):
                os.remove(outfile)

    def run_optimized(self):
        '''Run the MIT neuroproof'''
        #
        # The arguments for neuroproof_graph_predict:
        #
        output_target = self.output()
        output_target.create_directories()
        output = self.storage_plan
        #
        # neuroproof_graph_predict will take a .json file in place of a
        # prediction file. It has the following format:
        #
        # { "probabilities": [
        #       "<probability-loading-plan-1>",
        #       ...
        #       "<probability-loading-plan-N"
        #   ]
        #   "config": {
        #        "invert": [ True or False per probability ],
        #        "use-loading-plan": True,
        #        "use-storage-plan": True
        #   }
        #   "watershed": "watershed-loading-plan",
        #   "output": "output-storage-plan" }
        #
        # config is optional as are its key/value pairs. Predictably,
        # "invert" is False by default.
        #
        probabilities = \
            [self.prob_loading_plan_path] + \
            list(self.additional_loading_plan_paths)
        watershed = self.input_seg_loading_plan_path
        config_path = \
            os.path.splitext(self.classifier_filename)[0] + "_config.json"
        if os.path.isfile(config_path):
            config = json.load(open(config_path, "r"))
        else:
            config = {}
        config["use-loading-plans"] = True
        config["use-storage-plans"] = True
        d = dict(config=config,
                 probabilities=probabilities,
                 watershed=watershed,
                 output=output)
        fd, json_path = tempfile.mkstemp(".json")
        f = os.fdopen(fd, "w")
        json.dump(d, f)
        f.close()
        try:
            args = [
                self.neuroproof, "--threshold",
                str(self.threshold), "--watershed-threshold",
                str(self.watershed_threshold), json_path, json_path,
                self.classifier_filename
            ]
            #
            # Inject the custom LD_LIBRARY_PATH into the subprocess environment
            #
            env = os.environ.copy()
            if "LD_LIBRARY_PATH" in env:
                ld_library_path = self.neuroproof_ld_library_path + os.pathsep +\
                    env["LD_LIBRARY_PATH"]
            else:
                ld_library_path = self.neuroproof_ld_library_path
            env["LD_LIBRARY_PATH"] = ld_library_path
            self.configure_env(env)
            #
            # Do the dirty deed...
            #
            subprocess.check_call(args, env=env, close_fds=True)
            #
            # Finish the output volume
            #
            # We collect some summary statistics here that are added to
            # the JSON file.
            #
            data = output_target.imread()
            d = json.load(open(output_target.storage_plan_path))
            areas = np.bincount(data.ravel())
            areas[0] = 0
            labels = np.where(areas > 0)[0]
            areas = areas[labels]
            d["areas"] = areas.tolist()
            d["labels"] = labels.tolist()
            with output_target.open("w") as fd:
                json.dump(d, fd)
        finally:
            os.remove(json_path)

    def run_fast(self):
        '''Run using Tim Kaler's speedup + NeuroProof_plan'''
        #
        # Make the target directories for the .tif files
        #
        output_target = self.output()
        output_target.create_directories()

        arguments = [
            self.neuroproof, "-watershed", self.input_seg_loading_plan_path,
            "-prediction", self.prob_loading_plan_path
        ]
        for path in self.additional_loading_plan_paths:
            arguments.append("-prediction")
            arguments.append(path)
        arguments += [
            "-classifier", self.classifier_filename, "-output",
            self.storage_plan, "-threshold",
            str(self.threshold), "-algorithm", "1", "-nomito",
            "-min_region_sz",
            str(self.watershed_threshold)
        ]
        rh_logger.logger.report_event("Executing %s" % (" ".join(arguments)))
        #
        # Inject the custom LD_LIBRARY_PATH into the subprocess environment
        #
        env = os.environ.copy()
        if "LD_LIBRARY_PATH" in env:
            ld_library_path = self.neuroproof_ld_library_path + os.pathsep +\
                env["LD_LIBRARY_PATH"]
        else:
            ld_library_path = self.neuroproof_ld_library_path
        env["LD_LIBRARY_PATH"] = ld_library_path
        self.configure_env(env)
        #
        # Do the dirty deed...
        #
        subprocess.check_call(arguments, env=env)
        #
        # Finish the output volume
        #
        # We collect some summary statistics here that are added to
        # the JSON file.
        #
        data = output_target.imread()
        d = json.load(open(output_target.storage_plan_path))
        areas = np.bincount(data.ravel())
        areas[0] = 0
        labels = np.where(areas > 0)[0]
        areas = areas[labels]
        d["areas"] = areas.tolist()
        d["labels"] = labels.tolist()
        with output_target.open("w") as fd:
            json.dump(d, fd)
Code Example #10
class Backtest(luigi.Task):
    trade_fn = luigi.EnumParameter(
        enum=TradeFunction,
        default=TradeFunction.b_cross
    )
    rand_seed = luigi.IntParameter(
        default=random.randrange(sys.maxsize)
    )

    def requires(self):
        return [BollingerBands()]

    def output(self):
        if self._is_stochastic():
            return luigi.LocalTarget(
                config.data_dir + "trading/backtest_{}/{:06d}.csv".format(
                    self.trade_fn,
                    self.rand_seed
                )
            )
        else:
            return luigi.LocalTarget(
                config.data_dir + "trading/backtest_{}.csv".format(
                    self.trade_fn
                )
            )

    def _is_stochastic(self):
        """Return true if selected trade_fn is stochastic"""
        return self.trade_fn in [TradeFunction.random]

    def run(self):
        random.seed(self.rand_seed)
        # Read input
        print(self.input()[0]["bollinger"].path)
        dta = pd.read_csv(
            self.input()[0]["bollinger"].path,
            usecols=[
                'Date(UTC)', 'Value', 'EMA', 'STD', 'Upper Band', 'Lower Band'
            ],
            parse_dates=['Date(UTC)'],
            converters={
                'Value': float,
                'EMA': float,
                # 'STD': float,
                # 'Upper Band': float,
                # 'Lower Band': float,
            },
        )

        assets_dta = self._get_backtest_assets(dta)
        # Write to CSV
        assets_dta.to_csv(self.output().path, index=False)

    def _get_backtest_assets(self, price_dta, skip_first_n=1):
        """
        parameters:
        -----------
        skip_first_n : int
            allows calculations to catch up. must be >= 1
        """
        assert skip_first_n > 0
        wallet = Wallet()
        assets = [{
            **wallet.asset_dict(),
            'date_time': price_dta['Date(UTC)'][0],
            'netHoldings': 0,
            'trade': 0,
        }]

        # Iterate over all rows, adding trade data
        # trades = pd.DataFrame(columns=['date_time', 'price', 'trade'])
        for index, row in price_dta.iterrows():
            if index < skip_first_n:
                continue
            # implied else
            trade_amt = trade_function_map[self.trade_fn.value](
                price=row['Value'],
                bollinger_lower=row['Lower Band'],
                bollinger_upper=row['Upper Band'],
                max_trade=floor(assets[-1]['btc']*0.5),
                eth_btc_ratio=(1+assets[-1]['eth']) / (1+assets[-1]['btc']),
            )
            # if (trade_amt != 0):
            #     trades = trades.append({
            #         "price": row['Value'],
            #         "trade": trade_amt
            #     }, ignore_index=True)
            # else:
            #     trades = trades.append([
            #         row['Date(UTC)'],
            #         row['Value'],
            #         0
            #     ])
        # trades.to_csv(self.output()["trades"].path, index=False)

            trade_penalty = .05
            if trade_amt != 0:
                wallet.trade(
                    {'btc': - trade_amt},
                    {'eth': - trade_amt / row['Value'] * (1 - trade_penalty)},
                )
            assets.append({
                "date_time": row['Date(UTC)'],
                **wallet.asset_dict(),
                "trade": trade_amt
            })

            # Convert values to exchange currency
            assets[-1]['eth'] = assets[-1]['eth'] * row['Value']

            # Calculate net value of holdings
            assets[-1]['netHoldings'] = (
                assets[-1]['btc'] +
                assets[-1]['eth']
            )

        # Convert List to DataFrame
        return pd.DataFrame(assets)
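The call through `trade_function_map[self.trade_fn.value]` implies that every trade function shares one keyword interface; a hypothetical entry, shown only to illustrate that signature (the real mapping, enum and sign conventions live elsewhere in the project):

def bollinger_cross_trade(price, bollinger_lower, bollinger_upper,
                          max_trade, eth_btc_ratio):
    """Toy rule: trade when the price crosses outside the Bollinger bands."""
    if price < bollinger_lower:
        return max_trade      # price looks cheap: trade up to max_trade BTC
    if price > bollinger_upper:
        return -max_trade     # price looks expensive: trade the other way
    return 0                  # otherwise hold; Wallet.trade defines the signs

# trade_function_map = {TradeFunction.b_cross.value: bollinger_cross_trade, ...}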
Code Example #11
class DerivedLabels2D(luigi.Task):
    """
    Produce 2D label array at a specific time from tracked objects with
    specific properties (these conditions are implemented for each `label_type`)
    """

    label_type = luigi.Parameter(default="newlyformed_singlecore_clouds")

    base_name = luigi.Parameter()
    time = NumpyDatetimeParameter()

    tracking_type = luigi.EnumParameter(enum=TrackingType)
    tracking_timestep_interval = luigi.ListParameter(default=[])
    offset_labels_by_gal_transform = luigi.BoolParameter(default=False)
    track_without_gal_transform = luigi.BoolParameter(default=False)

    def requires(self):
        tasks = {}

        kws = dict(
            base_name=self.base_name,
            time=self.time,
            tracking_type=self.tracking_type,
            offset_labels_by_gal_transform=self.offset_labels_by_gal_transform,
            track_without_gal_transform=self.track_without_gal_transform,
            tracking_timestep_interval=self.tracking_timestep_interval,
        )

        if self.label_type == "newlyformed_singlecore_clouds":
            tasks["labels"] = TrackingLabels2D(
                label_var="cloud",
                **kws,
            )
            tasks["object_type"] = TrackingVariable2D(
                var_name="object_type",
                **kws,
            )
            tasks["object_age"] = TrackingVariable2D(
                var_name="object_age",
                **kws,
            )

        return tasks

    def run(self):
        if self.label_type == "newlyformed_singlecore_clouds":
            da_labels = self.input()["labels"].open().fillna(0).astype(int)
            # Filtering to newly formed single-core clouds is not implemented
            # yet; loading the labels succeeds but the task deliberately stops here.
            raise NotImplementedError(da_labels)
            da_labels_filtered = None

        Path(self.output().fn).parent.mkdir(exist_ok=True, parents=True)
        da_labels_filtered.to_netcdf(self.output().fn)

    def output(self):
        type_id = uclales_2d_tracking.TrackingType.make_identifier(
            self.tracking_type)
        if self.tracking_timestep_interval:
            interval_id = "tn{}_to_tn{}".format(
                *self.tracking_timestep_interval)
        else:
            interval_id = "__all__"

        name_parts = [
            self.var_name,
            f"of_{self.label_var}",
            f"tracked_{type_id}",
            interval_id,
            self.time.isoformat(),
        ]

        if self.dx:
            name_parts.insert(1, f"{self.dx}_{self.op}")
        else:
            name_parts.insert(1, self.op)

        if self.offset_labels_by_gal_transform:
            meta = _get_dataset_meta_info(self.base_name)
            u_gal, v_gal = meta["U_gal"]
            name_parts.append(f"go_labels_{u_gal}_{v_gal}")

        if self.track_without_gal_transform:
            name_parts.append("go_track")

        fn = f"{'.'.join(name_parts)}.nc"

        p = get_workdir(
        ) / self.base_name / "cross_sections" / "aggregated" / fn
        return XArrayTarget(str(p))
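For the filtering that run() leaves unimplemented, one possible shape is sketched below. It assumes the labels, object_type and object_age inputs open as per-pixel xarray DataArrays on the same grid; the code SINGLE_CORE_CLOUD and the cutoff MAX_AGE are hypothetical names introduced only for the sketch:

SINGLE_CORE_CLOUD = 1   # hypothetical integer code in the object_type field
MAX_AGE = 600.0         # hypothetical "newly formed" age cutoff

def filter_newlyformed_singlecore(da_labels, da_object_type, da_object_age):
    """Keep only labels belonging to young, single-core cloud objects."""
    keep = (da_object_type == SINGLE_CORE_CLOUD) & (da_object_age < MAX_AGE)
    # zero out every label that fails the conditions
    return da_labels.where(keep, other=0)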
Code Example #12
class MergePredictionsPipeline(luigi.Task):
    task_namespace = "ariadne_microns_pipeline"
    
    operation = luigi.EnumParameter(
        enum=MergeOperation,
        default=MergeOperation.Average,
        description="The operation to perform")
    invert = luigi.BoolParameter(
        description="Subtract the result from the maximum allowed value")
    connectivity_graph_path = luigi.Parameter(
        description="The location of the connectivity graph")
    input_dataset_names = luigi.ListParameter(
        description="The dataset names of the inputs to be merged.")
    output_dataset_name = luigi.Parameter(
        description="The dataset name of the outputs to be generated.")
    index_file_name = luigi.Parameter(
        description="The name of the index file containing the ouput "
        "dataset's loading and storage plans")
    
    def output(self):
        return luigi.LocalTarget(self.index_file_name)
    
    def requires(self):
        if not hasattr(self, "requirements"):
            try:
                rh_logger.logger.start_process(
                    "MergePredictions", "starting", [])
            except Exception:
                # the logging process may already have been started
                pass
            self.compute_requirements()
        return self.requirements
    
    def compute_requirements(self):
        self.cg = ConnectivityGraph.load(open(self.connectivity_graph_path))
        #
        # Find the loading plans of the input channels
        #
        rh_logger.logger.report_event("Finding input channel loading plans")
        self.find_input_channel_loading_plans()
        #
        # Find the storage plans of the input channels. These get used
        # to write loading plans that match the storage plans and to
        # write storage plans for the output channel.
        #
        rh_logger.logger.report_event("Finding input channel storage plans")
        self.find_input_channel_storage_plans()
        #
        # Write the loading plans for the input channels
        #
        rh_logger.logger.report_event("Writing input channel loading plans")
        self.write_input_channel_loading_plans()
        #
        # Write the storage plans for the output channel
        #
        rh_logger.logger.report_event("Writing output channel storage plans")
        self.write_output_channel_storage_plans()
        #
        # Write the loading plans for the output channel
        #
        rh_logger.logger.report_event("Write output channel loading plans")
        self.write_output_loading_plans()
        #
        # Make the needed tasks
        #
        self.requirements = self.make_merge_tasks()
    
    def find_input_channel_loading_plans(self):
        self.input_channel_lps = dict(
            [(channel, {}) for channel in self.input_dataset_names])
        #
        # We get the input channel loading plans from the 
        # Neuroproof loading plans by hacking their names
        #
        for volume, location in self.cg.locations.items():
            location_dir = os.path.dirname(location)
            paths = glob.glob(os.path.join(location_dir, "*.loading.plan"))
            for channel in self.input_dataset_names:
                for path in paths:
                    if os.path.split(path)[1].startswith(channel):
                        self.input_channel_lps[channel][volume] = path
    
    def find_input_channel_storage_plans(self):
        self.input_channel_sps = dict(
            [(channel, {}) for channel in self.input_dataset_names])
        #
        # We enumerate all the storage plans in each loading plan
        #
        for channel in self.input_dataset_names:
            d = self.input_channel_sps[channel]
            for volume, lp in self.input_channel_lps[channel].items():
                for sp in DestVolumeReader(lp).get_source_targets():
                    d[to_hashable(sp.volume)] = sp.storage_plan_path
                    
        
    def write_input_channel_loading_plans(self):
        '''Write loading plans that mirror the input channel storage plans'''
        self.input_channel_block_lps = dict(
            [(channel, {}) for channel in self.input_dataset_names])
        for channel in self.input_dataset_names:
            d = self.input_channel_block_lps[channel]
            for volume, sp in self.input_channel_sps[channel].items():
                sp_dir = os.path.dirname(sp)
                lp_path = os.path.join(
                    sp_dir,
                    "%s_%d-%d_%d-%d_%d-%d.loading_plan" % 
                     (channel, volume.x, volume.x1, volume.y, volume.y1,
                      volume.z, volume.z1))
                d[volume] = lp_path
                storage_plan = SrcVolumeTarget(sp)
                storage_plan.write_loading_plan(lp_path)
    
    def write_output_channel_storage_plans(self):
        '''Write a storage plan for each block to be merged'''
        self.output_channel_storage_plans = {}
        #
        # Copy channel 0's storage plan
        #
        ch0 = self.input_dataset_names[0]
        for volume, sp in self.input_channel_sps[ch0].items():
            spd = json.load(open(sp))
            sp_dir, sp0_file = os.path.split(sp)
            sp_file = "%s_%d-%d_%d-%d_%d-%d.storage.plan" % (
                self.output_dataset_name, 
                spd["x"], spd["x"] + spd["dimensions"][2],
                spd["y"], spd["y"] + spd["dimensions"][1],
                spd["z"], spd["z"] + spd["dimensions"][0])
            sp_path = os.path.join(sp_dir, sp_file)
            spd["dataset_name"] = self.output_dataset_name
            blocks = spd["blocks"]
            spd["blocks"] = []
            for v, tif_path in blocks:
                tif_file = "%s_%d-%d_%d-%d_%d-%d.tif" % (
                    self.output_dataset_name, 
                    v["x"], v["x"] + v["width"],
                    v["y"], v["y"] + v["height"],
                    v["z"], v["z"] + v["depth"])
                tif_path = os.path.join(os.path.dirname(tif_path), tif_file)
                spd["blocks"].append((v, tif_path))
            json.dump(spd, open(sp_path, "w"))
            self.output_channel_storage_plans[volume] = sp_path
    
    def write_output_loading_plans(self):
        '''Write loading plans for the output channel based on the input lps'''
        
        self.output_channel_loading_plans = {}
        #
        # Copy channel 0's loading plans
        #
        ch0 = self.input_dataset_names[0]
        for volume, lp in self.input_channel_lps[ch0].items():
            lpd = json.load(open(lp))
            lp_dir, lp0_file = os.path.split(lp)
            lp_file = "%s_%d-%d_%d-%d_%d-%d.loading.plan" % (
                self.output_dataset_name, 
                lpd["x"], lpd["x"] + lpd["dimensions"][2],
                lpd["y"], lpd["y"] + lpd["dimensions"][1],
                lpd["z"], lpd["z"] + lpd["dimensions"][0])
            lp_path = os.path.join(lp_dir, lp_file)
            lpd["dataset_name"] = self.output_dataset_name
            blocks = lpd["blocks"]
            lpd["blocks"] = []
            for tif_path, v in blocks:
                tif_file = "%s_%d-%d_%d-%d_%d-%d.tif" % (
                    self.output_dataset_name, 
                    v["x"], v["x"] + v["width"],
                    v["y"], v["y"] + v["height"],
                    v["z"], v["z"] + v["depth"])
                tif_path = os.path.join(os.path.dirname(tif_path), tif_file)
                lpd["blocks"].append((tif_path, v))
            json.dump(lpd, open(lp_path, "w"))
            self.output_channel_loading_plans[volume] = lp_path
        
    def make_merge_tasks(self):
        '''Make one merge task per block'''
        tasks = []
        for volume, sp in self.output_channel_storage_plans.items():
            lps = [self.input_channel_block_lps[channel][volume]
                   for channel in self.input_dataset_names]
            task = MergePredictionsTask(
               storage_plan=sp,
               loading_plans=lps,
               operation=self.operation,
               invert=self.invert)
            tasks.append(task)
        return tasks
    
    def run(self):
        '''Make an index file with the details of the run'''
        d = dict(output_channel_loading_plans=[],
                 output_channel_storage_plans=[])
        lists = []
        for channel in self.input_dataset_names:
            cd = d[channel] = {}
            # pair each JSON key with the attribute that holds those plans
            for name, plans in (
                    ("input_channel_loading_plans", self.input_channel_lps),
                    ("input_channel_block_loading_plans",
                     self.input_channel_block_lps),
                    ("input_channel_storage_plans", self.input_channel_sps)):
                d1 = cd[name] = []
                lists.append((d1, plans[channel]))
        lists.append((d["output_channel_loading_plans"],
                      self.output_channel_loading_plans))
        lists.append((d["output_channel_storage_plans"],
                      self.output_channel_storage_plans))
        for l1, d2 in lists:
            for v, path in d2.items():
                l1.append((to_json_serializable(v), path))
        with self.output().open("w") as fd:
            json.dump(d, fd)
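The resulting index file can then be read back by downstream code; a minimal sketch, assuming the JSON layout written by run() above (one entry per input channel plus the two output_channel_* lists of (volume, path) pairs):

import json

def read_merge_index(index_path, input_dataset_names):
    """Load the merge index and split it into per-channel and output plans."""
    with open(index_path) as fd:
        index = json.load(fd)
    per_channel = {channel: index[channel] for channel in input_dataset_names}
    output_loading_plans = index["output_channel_loading_plans"]
    output_storage_plans = index["output_channel_storage_plans"]
    return per_channel, output_loading_plans, output_storage_plans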
Code Example #13
class LinkwaglOutputs(luigi.Task):
    """
    Link all the multifile outputs from wagl into a single file.
    """

    level1 = luigi.Parameter()
    work_root = luigi.Parameter()
    granule = luigi.OptionalParameter(default='')
    acq_parser_hint = luigi.OptionalParameter(default='')
    workflow = luigi.EnumParameter(enum=Workflow)
    vertices = luigi.TupleParameter(default=(5, 5))
    pixel_quality = luigi.BoolParameter()
    method = luigi.EnumParameter(enum=Method, default=Method.SHEAR)
    dsm_fname = luigi.Parameter(significant=False)
    buffer_distance = luigi.FloatParameter(default=8000, significant=False)

    def requires(self):
        container = acquisitions(self.level1, self.acq_parser_hint)
        for group in container.supported_groups:
            kwargs = {
                'level1': self.level1,
                'work_root': self.work_root,
                'granule': self.granule,
                'group': group,
                'workflow': self.workflow,
                'vertices': self.vertices,
                'pixel_quality': self.pixel_quality,
                'method': self.method,
                'dsm_fname': self.dsm_fname,
                'buffer_distance': self.buffer_distance
            }
            yield DataStandardisation(**kwargs)

    def output(self):
        out_fname = pjoin(dirname(self.work_root),
                          '{}.h5'.format(self.granule))
        return luigi.LocalTarget(out_fname)

    def run(self):
        with self.output().temporary_path() as out_fname:
            for root, _, files in os.walk(self.work_root):
                # skip any private files
                if basename(root).startswith('_'):
                    continue

                for file_ in files:
                    if splitext(file_)[1] == '.h5':
                        fname = pjoin(root, file_)
                        grp_name = basename(
                            dirname(fname.replace(self.work_root, '')))

                        with h5py.File(fname, 'r') as fid:
                            groups = [g for g in fid]

                        for pth in groups:
                            new_path = ppjoin(self.granule, grp_name, pth)
                            create_external_link(fname, pth, out_fname,
                                                 new_path)

            # open in append mode (required explicitly by recent h5py versions)
            with h5py.File(out_fname, 'a') as fid:
                fid.attrs['level1_uri'] = self.level1
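create_external_link is presumably wiring HDF5 external links from the per-group files into the single granule file; a hedged sketch of what such a helper could look like with plain h5py (the actual wagl helper may differ):

import h5py

def create_external_link_sketch(src_fname, src_path, out_fname, new_path):
    """Link out_fname:new_path to src_fname:src_path via an HDF5 external link."""
    with h5py.File(out_fname, 'a') as fid:
        if new_path not in fid:
            fid[new_path] = h5py.ExternalLink(src_fname, src_path)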
Code Example #14
class WriteTp5(luigi.Task):
    """Output the `tp5` formatted files."""

    level1 = luigi.Parameter()
    work_root = luigi.Parameter(significant=False)
    granule = luigi.OptionalParameter(default='')
    vertices = luigi.TupleParameter()
    acq_parser_hint = luigi.OptionalParameter(default='')
    workflow = luigi.EnumParameter(enum=Workflow)
    base_dir = luigi.Parameter(default='_atmospherics', significant=False)
    compression = luigi.EnumParameter(enum=H5CompressionFilter,
                                      default=H5CompressionFilter.LZF,
                                      significant=False)
    filter_opts = luigi.DictParameter(default=None, significant=False)

    def requires(self):
        container = acquisitions(self.level1, self.acq_parser_hint)
        tasks = {}

        tasks['ancillary'] = AncillaryData(self.level1, self.work_root,
                                           self.granule, self.vertices,
                                           self.workflow)

        for group in container.supported_groups:
            args = [self.level1, self.work_root, self.granule, group]
            tsks = {
                'sat_sol': CalculateSatelliteAndSolarGrids(*args),
                'lon_lat': CalculateLonLatGrids(*args)
            }
            tasks[group] = tsks

        return tasks

    def output(self):
        out_fname = pjoin(self.work_root, 'atmospheric-inputs.h5')
        return luigi.LocalTarget(out_fname)

    def run(self):
        container = acquisitions(self.level1, self.acq_parser_hint)
        acqs, group = container.get_highest_resolution(granule=self.granule)

        # output filename format
        output_fmt = pjoin(POINT_FMT, ALBEDO_FMT,
                           ''.join([POINT_ALBEDO_FMT, '.tp5']))

        # input filenames
        ancillary_fname = self.input()['ancillary'].path
        sat_sol_fname = self.input()[group]['sat_sol'].path
        lon_lat_fname = self.input()[group]['lon_lat'].path

        with self.output().temporary_path() as out_fname:
            tp5_data = _format_tp5(acqs, sat_sol_fname, lon_lat_fname,
                                   ancillary_fname, out_fname, self.workflow)

            # keep this as an indented block, that way the target will remain
            # atomic and be moved upon closing
            for key in tp5_data:
                point, albedo = key
                tp5_fname = output_fmt.format(p=point, a=albedo.value)
                target = pjoin(dirname(out_fname), self.base_dir, tp5_fname)
                with luigi.LocalTarget(target).open('w') as dst:
                    dst.writelines(tp5_data[key])
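A hedged sketch of scheduling this task programmatically; every parameter value below is a placeholder, and Workflow.STANDARD is assumed to be a valid member of the project's Workflow enum:

import luigi

# hypothetical invocation; substitute real paths, granule id and enum members
luigi.build(
    [WriteTp5(
        level1='/data/level1-scene',      # placeholder level-1 dataset path
        work_root='/scratch/wagl-work',   # placeholder working directory
        granule='GRANULE_ID',             # placeholder granule identifier
        vertices=(5, 5),
        workflow=Workflow.STANDARD,       # assumed enum member
    )],
    local_scheduler=True,
)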
Code Example #15
class FindSeedsRunMixin(DatasetMixin):

    dimensionality = luigi.EnumParameter(
        enum=Dimensionality,
        description="Whether to find seeds in each 2D plane or in the "
        "volume as a whole")
    method = luigi.EnumParameter(
        enum=SeedsMethodEnum, description="The algorithm used to find seeds")
    sigma_xy = luigi.FloatParameter(
        description=
        "The sigma of the smoothing Gaussian in the x & y directions",
        default=3)
    sigma_z = luigi.FloatParameter(
        description="The sigma of the smoothing Gaussian in the z direction",
        default=.4)
    threshold = luigi.FloatParameter(
        description="The intensity threshold cutoff for the seeds", default=1)
    minimum_distance_xy = luigi.FloatParameter(
        default=5, description="The minimum distance allowed between seeds")
    minimum_distance_z = luigi.FloatParameter(
        default=1.5,
        description="The minimum distance allowed between seed in the z dir")
    structuring_element = luigi.EnumParameter(
        enum=Shape,
        default=Shape.Cube,
        description="The shape of the structuring element."
        " Ellipsoid is slower, but honors the distances."
        " Cube is faster, but excludes due to extrema at the corners of "
        "the cube")
    distance_threshold = luigi.FloatParameter(
        default=20,
        description="The distance threshold cutoff for the seeds in nm")
    #
    # Parameters for block management of the distance threshold calculation
    #
    xy_nm = luigi.FloatParameter(
        default=4.0, description="Size of a voxel in the X and Y direction")
    z_nm = luigi.FloatParameter(
        default=30.0, description="Size of a voxel in the Z direction")
    dt_xy_overlap = luigi.IntParameter(
        default=40,
        description="Overlap between distance transform blocks in the x and y "
        "directions")
    dt_z_overlap = luigi.IntParameter(
        default=5,
        description="Overlap between distance transform blocks in the z "
        "direction")
    dt_xy_block_size = luigi.IntParameter(
        default=512,
        description="Block size in the x and y directions for the distance "
        "transform.")
    dt_z_block_size = luigi.IntParameter(
        default=40,
        description="Block size in the z direction for the distance transform")
    dt_n_cpus = luigi.IntParameter(
        default=4,
        description="Number of CPUs to use when computing the distance "
        "transform")

    def make_strel(self):
        '''make the structuring element for the minimum distance'''
        if self.structuring_element == Shape.Cube:
            return np.ones([
                int(np.floor(_) * 2 + 1)
                for _ in (self.minimum_distance_z,
                          self.minimum_distance_xy,
                          self.minimum_distance_xy)
            ], bool)

        ixy = int(np.floor(self.minimum_distance_xy))
        iz = int(np.floor(self.minimum_distance_z))
        z, y, x = np.mgrid[-iz:iz + 1, -ixy:ixy + 1,
                           -ixy:ixy + 1].astype(np.float32)
        strel = ((z / self.minimum_distance_z)**2 +
                 (y / self.minimum_distance_xy)**2 +
                 (x / self.minimum_distance_xy)**2) <= 1
        return strel

    def find_using_2d_smoothing(self, probs):
        '''Find seeds in each plane, smoothing, then thresholding
        
        :param probs: the probability volume
        '''
        offset = 0
        seeds = []
        for plane in probs.astype(np.float32):
            smoothed = gaussian_filter(plane.astype(np.float32), self.sigma_xy)
            size = self.minimum_distance_xy
            eroded = grey_erosion(smoothed, size)
            thresholded = (smoothed < self.threshold) & (smoothed == eroded)
            labels, count = label(thresholded)
            labels[labels != 0] += offset
            offset += count
            seeds.append(labels)
        return np.array(seeds)

    def find_using_3d_smoothing(self, probs):
        '''Find seeds after smoothing and thresholding

        :param probs: the probability volume
        '''
        sigma = (self.sigma_z, self.sigma_xy, self.sigma_xy)
        smoothed = gaussian_filter(probs.astype(np.float32), sigma)
        eroded = grey_erosion(smoothed, footprint=self.make_strel())
        thresholded = (smoothed < self.threshold) & (smoothed == eroded)
        labels, count = label(thresholded)
        rh_logger.logger.report_event("Found %d seeds" % count)
        return labels

    def find_using_2d_distance(self, probs):
        '''Find seeds in each plane by distance transform

        :param probs: the probability volume
        '''
        offset = 0
        seeds = []
        for plane in probs.astype(np.float32):
            thresholded = plane < self.threshold
            distance = distance_transform_edt(thresholded)
            dilated = grey_dilation(distance, size=self.minimum_distance_xy)
            mask = ((distance == dilated) &
                    (distance >= self.distance_threshold))
            labels, count = label(mask)
            labels[labels != 0] += offset
            offset += count
            seeds.append(labels)
        return np.array(seeds)

    def find_using_3d_distance(self, probs):
        '''Find seeds in the whole volume by distance transform

        :param probs: the probability volume
        '''
        thresholded = probs < self.threshold
        distance = parallel_distance_transform(thresholded, self.xy_nm,
                                               self.z_nm, self.dt_xy_overlap,
                                               self.dt_z_overlap,
                                               self.dt_xy_block_size,
                                               self.dt_z_block_size,
                                               self.dt_n_cpus)
        dilated = grey_dilation(distance, footprint=self.make_strel())
        mask = (distance == dilated) & (distance >= self.distance_threshold)
        labels, count = label(mask)
        rh_logger.logger.report_event("Found %d seeds" % count)
        return labels

    def ariadne_run(self):
        prob_target = DestVolumeReader(self.prob_loading_plan_path)
        probs = prob_target.imread()
        if self.method == SeedsMethodEnum.Smoothing:
            if self.dimensionality == Dimensionality.D2:
                seeds = self.find_using_2d_smoothing(probs)
            else:
                seeds = self.find_using_3d_smoothing(probs)
        else:
            if self.dimensionality == Dimensionality.D2:
                seeds = self.find_using_2d_distance(probs)
            else:
                seeds = self.find_using_3d_distance(probs)
        seeds = seeds.astype(np.uint32)
        self.output().imwrite(seeds)
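The smoothing-based 2D detection above amounts to labelling local minima of a blurred probability map that fall below a threshold; a small self-contained sketch of that idea on synthetic data, using the same scipy.ndimage primitives (the parameter values are illustrative only):

import numpy as np
from scipy.ndimage import gaussian_filter, grey_erosion, label

def find_seeds_2d(plane, sigma_xy=3.0, min_dist_xy=5, threshold=0.5):
    """Label local minima of the smoothed plane that lie below the threshold."""
    smoothed = gaussian_filter(plane.astype(np.float32), sigma_xy)
    eroded = grey_erosion(smoothed, size=int(min_dist_xy))
    mask = (smoothed < threshold) & (smoothed == eroded)
    seeds, count = label(mask)
    return seeds, count

# toy input: two low-probability blobs in a high-probability background
plane = np.ones((64, 64), np.float32)
plane[10:22, 10:22] = 0.0
plane[40:52, 40:52] = 0.0
seeds, count = find_seeds_2d(plane)   # expect one labelled seed per blob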