Example #1
def merge_geog():
    """
    Choose the most precise geometry between the initial Lambert II
    coordinates and the geocoded coordinates, for records whose geog
    has not been set from cadastre information
    """

    # Input dataset
    basol_geocoded = Dataset("etl", "basol_normalized")

    # Output dataset
    basol_geog_merged = Dataset("etl", "basol_geog_merged")

    basol_geog_merged.write_dtype([
        *basol_geocoded.read_dtype(),
        Column("geog", Geometry(srid=4326)),
        Column("geog_precision", String),
        Column("geog_source", String)
    ])

    BasolGeocoded = basol_geocoded.reflect()

    session = basol_geocoded.get_session()

    point_lambert2 = func.ST_Transform(
        func.ST_setSRID(
            func.ST_MakePoint(BasolGeocoded.coordxlambertii,
                              BasolGeocoded.coordylambertii), LAMBERT2), WGS84)

    point_geocoded = func.ST_setSRID(
        func.ST_MakePoint(BasolGeocoded.geocoded_longitude,
                          BasolGeocoded.geocoded_latitude), WGS84)

    q = session.query(BasolGeocoded, point_lambert2, point_geocoded).all()

    with basol_geog_merged.get_writer() as writer:

        for (row, point_lambert2, point_geocoded) in q:

            output_row = {
                **row2dict(row), "geog": None,
                "geog_precision": None,
                "geog_source": None
            }

            if row.l2e_precision == precisions.HOUSENUMBER:

                output_row["geog"] = point_lambert2
                output_row["geog_precision"] = row.l2e_precision
                output_row["geog_source"] = "lambert2"

            elif (row.geocoded_result_type == precisions.HOUSENUMBER) and \
                 (row.geocoded_result_score >= 0.6):
                output_row["geog"] = point_geocoded
                output_row["geog_precision"] = row.geocoded_result_type
                output_row["geog_source"] = "geocodage"

            writer.write_row_dict(output_row)

    session.close()
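Example #1 assumes a few helpers that are not shown here: the LAMBERT2 and WGS84 SRID constants and a row2dict converter. A minimal sketch of what they could look like (illustrative assumptions, not the project's actual definitions):

# Assumed SRID constants: 27572 is Lambert II étendu, 4326 is WGS84.
LAMBERT2 = 27572
WGS84 = 4326

def row2dict(row):
    # Convert a reflected SQLAlchemy mapped row into a plain dict of its columns.
    return {column.name: getattr(row, column.name) for column in row.__table__.columns}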
Example #2
def generate_folds_for_dataset():
    dataset_names = Dataset.get_dataset_names() + Dataset.interesting_2d_datasets()

    for dataset_name in dataset_names:

        dataset = Dataset(dataset_name)
        print("making folds for dataset ", dataset_name)
        os.makedirs(os.path.join(FOLD_PATH, dataset_name), exist_ok=True)
        for run_nb in range(10):
            # toon's code
            # skf = cross_validation.StratifiedKFold(labels, n_folds=5, shuffle=True)
            skf = StratifiedKFold(n_splits=10, shuffle=True)
            # skf = KFold(n_splits=10, shuffle=True)
            labels = dataset.target

            for fold_nb, (train_indices, test_indices) in enumerate(skf.split(np.zeros(len(labels)), labels)):

                to_write = dict()
                to_write["train_indices"] = train_indices.tolist()
                to_write["test_indices"] = test_indices.tolist()
                if os.path.isfile(os.path.join(FOLD_PATH, dataset_name, "run{}_fold{}.txt".format(run_nb, fold_nb))):
                    print("fold file already exists! not overwriting!")
                    continue
                with open(os.path.join(FOLD_PATH, dataset_name, "run{}_fold{}.txt".format(run_nb, fold_nb)), mode = 'w') as fold_file:
                    json.dump(to_write, fold_file)
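Each fold file written above is a small JSON document holding train_indices and test_indices lists. A matching reader might look like the sketch below; load_fold is a hypothetical helper that assumes the same FOLD_PATH/<dataset>/run{run}_fold{fold}.txt layout:

import json
import os

def load_fold(fold_path, dataset_name, run_nb, fold_nb):
    # Read back one fold file written by generate_folds_for_dataset
    fold_file = os.path.join(fold_path, dataset_name,
                             "run{}_fold{}.txt".format(run_nb, fold_nb))
    with open(fold_file) as f:
        fold = json.load(f)
    return fold["train_indices"], fold["test_indices"]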
Example #3
    def process_data(self,
                     dataset: Dataset,
                     stage: Optional[str] = None) -> Dataset:
        src_text_column_name, tgt_text_column_name = self.source_target_column_names

        convert_to_features = partial(
            self.convert_to_features,
            tokenizer=self.tokenizer,
            padding=self.cfg.padding,
            max_source_length=self.cfg.max_source_length,
            max_target_length=self.cfg.max_target_length,
            src_text_column_name=src_text_column_name,
            tgt_text_column_name=tgt_text_column_name,
        )
        dataset = dataset.map(
            convert_to_features,
            batched=True,
            num_proc=self.cfg.preprocessing_num_workers,
            load_from_cache_file=self.cfg.load_from_cache_file,
        )

        cols_to_keep = [
            x for x in ["input_ids", "attention_mask", "labels"]
            if x in dataset["train"].features
        ]
        dataset.set_format(columns=cols_to_keep)
        return dataset
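The map / set_format pattern used here (and in several later examples) can be exercised on a toy datasets.DatasetDict; the sketch below uses a fake batched tokenize function purely for illustration:

from datasets import Dataset, DatasetDict

def fake_tokenize(batch):
    # Stand-in for a real tokenizer: one token id per example, constant attention mask
    return {"input_ids": [[len(text)] for text in batch["text"]],
            "attention_mask": [[1] for _ in batch["text"]]}

dataset = DatasetDict({"train": Dataset.from_dict({"text": ["a", "bb"], "labels": [0, 1]})})
dataset = dataset.map(fake_tokenize, batched=True)
cols_to_keep = [c for c in ["input_ids", "attention_mask", "labels"]
                if c in dataset["train"].features]
dataset.set_format(columns=cols_to_keep)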
Example #4
def prepare_inputs(ds: Dataset, text_col: str, label_col: str) -> Dataset:

    ds = ds.remove_columns(column_names=[text_col, "__index_level_0__"])
    ds = ds.rename_column(label_col, "labels")
    ds = ds.with_format("torch")

    return ds
Example #5
def load_bottleneck_data(training_file, validation_file, breadth):
    """
    Utility function to load bottleneck features.

    Arguments:
        training_file - String, path to a pickled dict with 'features' and 'labels'
        validation_file - String, path to a pickled dict with 'features' and 'labels'
        breadth - output breadth passed to Likelihoods
    """
    print("Training file", training_file)
    print("Validation file", validation_file)
    print("Output breadth", breadth)

    with open(training_file, 'rb') as f:
        train_data = pickle.load(f)
    with open(validation_file, 'rb') as f:
        validation_data = pickle.load(f)

    X_train = train_data['features']
    y_train = train_data['labels']
    X_val = validation_data['features']
    y_val = validation_data['labels']

    D_train = Dataset('Training', Data(X_train), Likelihoods(y_train, breadth))
    D_val = Dataset('Validation', Data(X_val), Likelihoods(y_val, breadth))

    return (D_train, D_val)
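load_bottleneck_data only assumes that each pickle holds a dict with 'features' and 'labels' entries. A hypothetical writer producing compatible files (save_bottleneck_data is not part of the original code):

import pickle

import numpy as np

def save_bottleneck_data(path, features, labels):
    # Write a pickle in the layout expected by load_bottleneck_data
    with open(path, 'wb') as f:
        pickle.dump({'features': np.asarray(features),
                     'labels': np.asarray(labels)}, f)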
Example #6
    def process_data(self,
                     dataset: Dataset,
                     stage: Optional[str] = None) -> Dataset:
        input_feature_fields = [
            k for k, v in dataset["train"].features.items()
            if k not in ["label", "idx"]
        ]
        dataset = TextClassificationDataModule.preprocess(
            dataset,
            tokenizer=self.tokenizer,
            input_feature_fields=input_feature_fields,
            padding=self.cfg.padding,
            truncation=self.cfg.truncation,
            max_length=self.cfg.max_length,
        )
        cols_to_keep = [
            x for x in
            ["input_ids", "attention_mask", "token_type_ids", "labels"]
            if x in dataset["train"].features
        ]
        if not isinstance(dataset["train"].features["labels"], ClassLabel):
            dataset = dataset.class_encode_column("labels")

        dataset.set_format("torch", columns=cols_to_keep)
        self.labels = dataset["train"].features["labels"]
        return dataset
Example #7
 def __init__(self, generator, document):
     Dataset.__init__(self, data=[])
     self.generator = generator
     self.document = document
     self.linked = None
     self._invalidpoints = None
     self.changeset = -1
Example #8
def concatenate_datasets_with_ratio(args, train_dataset):
    concatenate_list = []

    for sub_dataset_name, ratio in zip(
            args.data.sub_datasets.split(","),
            args.data.sub_datasets_ratio.split(",")):
        ratio = float(ratio)
        sub_dataset_path = p.join(args.path.train_data_dir, sub_dataset_name)
        assert p.exists(sub_dataset_path), f"{sub_dataset_name} does not exist."

        sub_dataset = load_from_disk(sub_dataset_path)
        sub_dataset_len = int(len(sub_dataset["train"]) * ratio)

        print(f"ADD SUB DATASET {sub_dataset_name}, LENGTH: {sub_dataset_len}")

        # sub dataset must have same features: ['id', 'title', 'context', 'question', 'answers']
        features = sub_dataset["train"].features

        new_sub_dataset = sub_dataset["train"].select(range(sub_dataset_len))
        new_sub_dataset = Dataset.from_pandas(new_sub_dataset.to_pandas(),
                                              features=features)

        concatenate_list.append(new_sub_dataset.flatten_indices())

    train_dataset = Dataset.from_pandas(train_dataset.to_pandas(),
                                        features=features)
    train_dataset = concatenate_datasets([train_dataset.flatten_indices()] +
                                         concatenate_list)

    return train_dataset
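concatenate_datasets_with_ratio expects an args object with nested data and path attributes, comma-separated sub-dataset names and ratios of equal length, and p to be an alias for os.path. A hypothetical args value matching those accesses (the names, ratios and path are made up):

from types import SimpleNamespace

args = SimpleNamespace(
    data=SimpleNamespace(
        sub_datasets="sub_dataset_a,sub_dataset_b",   # folders previously saved with save_to_disk
        sub_datasets_ratio="0.5,0.3",                 # fraction of each sub dataset to keep
    ),
    path=SimpleNamespace(train_data_dir="./data/train"),
)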
Example #9
    def process_data(self,
                     dataset: Dataset,
                     stage: Optional[str] = None) -> Dataset:
        features, label_column_name, text_column_name = self._setup_input_fields(
            dataset, stage)

        self._prepare_labels(dataset, features, label_column_name)

        convert_to_features = partial(
            TokenClassificationDataModule.convert_to_features,
            tokenizer=self.tokenizer,
            padding=self.cfg.padding,
            label_all_tokens=self.cfg.label_all_tokens,
            label_to_id=self.label_to_id,
            text_column_name=text_column_name,
            label_column_name=label_column_name,
        )
        dataset = dataset.map(
            convert_to_features,
            batched=True,
            num_proc=self.cfg.preprocessing_num_workers,
            load_from_cache_file=self.cfg.load_from_cache_file,
        )
        cols_to_keep = [
            x for x in
            ["input_ids", "attention_mask", "token_type_ids", "labels", "idx"]
            if x in dataset["train"].features
        ]
        dataset.set_format(columns=cols_to_keep)
        return dataset
Example #10
    def load_data(
        self,
        hf_dataset: Dataset,
        input_key: str,
        target_keys: Optional[Union[str, List[str]]] = None,
        target_formatter: Optional[TargetFormatter] = None,
    ) -> Dataset:
        """Loads data into HuggingFace datasets.Dataset."""
        if not self.predicting:
            hf_dataset = hf_dataset.map(
                partial(self._resolve_target, target_keys))
            targets = hf_dataset.to_dict()[DataKeys.TARGET]
            self.load_target_metadata(targets,
                                      target_formatter=target_formatter)

            # If we had binary multi-class targets then we also know the labels (column names)
            if isinstance(self.target_formatter,
                          MultiBinaryTargetFormatter) and isinstance(
                              target_keys, List):
                self.labels = target_keys

        # remove extra columns
        extra_columns = set(
            hf_dataset.column_names) - {input_key, DataKeys.TARGET}
        hf_dataset = hf_dataset.remove_columns(extra_columns)

        if input_key != DataKeys.INPUT:
            hf_dataset = hf_dataset.rename_column(input_key, DataKeys.INPUT)

        return hf_dataset
Example #12
def matlab_test():
    dataset = Dataset("iris")
    clusterer = MyCOSCMatlab()
    clusterer.signal_start(dataset.data)
    result = clusterer.fit(dataset.data, [(1,2),(2,3),(3,dataset.number_of_instances())], [(10,12),(23,16)], dataset.number_of_classes())
    print(result)
    clusterer.signal_end()
Example #13
def load_datasets(lang="es", random_state=2021, preprocessing_args={}):
    """
    Load emotion recognition datasets
    """

    train_df = load_df(paths[lang]["train"])
    test_df = load_df(paths[lang]["test"])
    train_df, dev_df = train_test_split(train_df,
                                        stratify=train_df["label"],
                                        random_state=random_state)

    for df in [train_df, dev_df, test_df]:
        for label, idx in label2id.items():
            df.loc[df["label"] == label, "label"] = idx
        df["label"] = df["label"].astype(int)

    preprocess = lambda x: preprocess_tweet(x, lang=lang, **preprocessing_args)

    train_df.loc[:, "text"] = train_df["text"].apply(preprocess)
    dev_df.loc[:, "text"] = dev_df["text"].apply(preprocess)
    test_df.loc[:, "text"] = test_df["text"].apply(preprocess)

    features = Features({
        'text':
        Value('string'),
        'label':
        ClassLabel(num_classes=len(id2label),
                   names=[id2label[k] for k in sorted(id2label.keys())])
    })

    train_dataset = Dataset.from_pandas(train_df, features=features)
    dev_dataset = Dataset.from_pandas(dev_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    return train_dataset, dev_dataset, test_dataset
Example #14
    def run(self):
        algorithm = algorithm_info_to_object(self.algorithm_name, self.algorithm_parameters)
        querier_builder = querier_info_to_object(self.querier_name, self.querier_parameters)
        dataset = Dataset(self.dataset_name)
        train_indices = fold_path_to_train_indices(self.fold_path)
        querier = querier_builder.build_querier(dataset)
        result = None

        # COSC does not always produce a result and may end with an exception,
        # so any failure is caught and silently ignored (no result file is written)
        try:
            result = algorithm.fit(dataset.data, dataset.number_of_classes(), train_indices, querier)
        except Exception as e:
            print("An exception occured during calculation of {} (this is silently ignored):".format(self.result_path), file = sys.stderr)
            traceback.print_exc()

        if result is None:
            return

        # store the string "None" instead of None when there are no train indices
        train_indices = train_indices if train_indices is not None else "None"
        full_result = result + (train_indices,)
        os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
        with open(self.result_path, mode="w") as result_file:
            json.dump(full_result, result_file)
Example #15
def main():
    """
    Main process.

    """
    args = parse_cli_args()
    config = TrainConfig()

    train_ds = Dataset(args.train_path)
    valid_ds = Dataset(args.valid_path)

    model = make_model()
    optimizer = getattr(optim, config.optimizer_name)(model.parameters(),
                                                      lr=config.learning_rate)

    training = Training(
        train_ds,
        valid_ds,
        model,
        optimizer,
        config.batch_size,
        config.epochs,
    )

    training.train()
Example #16
 def pretrain_RNADE(self,):
     print('Pre-training the RNADE')
     l2 = 2.
     rnade = RNADE(self.n_visible,self.n_hidden,self.n_components,hidden_act=self.hidden_act,l2=l2)
     batch_size = 100
     num_examples = 100
     filename = 'pre_train_params.pickle'
     learning_rate = self.learning_rate_pretrain
     train_data = mocap_data.sample_train_seq(batch_size)
     for i in range(1, num_examples):
         train_data = numpy.vstack((train_data,mocap_data.sample_train_seq(batch_size)))
     numpy.random.shuffle(train_data)
     total_num = train_data.shape[0]
     train_frac = 0.8
     train_dataset = Dataset([train_data[0:int(train_frac*total_num)]],100)
     valid_dataset = Dataset([train_data[int(train_frac*total_num):]],100)
     optimiser = SGD_Optimiser(rnade.params,[rnade.v],[rnade.cost,rnade.ll_cost,rnade.l2_cost],momentum=True,patience=20,clip_gradients=self.clip_gradients)
     optimiser.train(train_dataset,valid_set=valid_dataset,learning_rate=learning_rate,num_epochs=5,save=True,
                 lr_update=True,update_type='linear',start=2,output_folder=self.output_folder,filename=filename)
     self.plot_costs(optimiser,fig_title='Pretraining cost',filename='pretraining.png')
     print('Done pre-training.')
     # load the best parameters from pre-training
     print('Loading best RNADE parameters')
     rnade = RNADE(self.n_visible,self.n_hidden,self.n_components,hidden_act=self.hidden_act,l2=l2)
     rnade.load_model(self.output_folder,filename=filename)
     ###########
     for param in rnade.params:
         value = param.get_value()
         self.model.params_dict[param.name].set_value(value)
     print('Done pre-training.')
     #Saving results to dict
     self.results['pretraining_train_costs'] = optimiser.train_costs
     self.results['pretraining_valid_costs'] = optimiser.valid_costs
Example #17
    def initialize(self, source, target, batch_size1, batch_size2, scale=32, shuffle_=False):
        transform = transforms.Compose([
                transforms.Resize(scale),
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        dataset_source = Dataset(source['imgs'], source['labels'], transform=transform)
        dataset_target = Dataset(target['imgs'], target['labels'], transform=transform)

        data_loader_s = torch.utils.data.DataLoader(
            dataset_source,
            batch_size=batch_size1,
            shuffle=shuffle_,
            num_workers=4
        )

        data_loader_t = torch.utils.data.DataLoader(
            dataset_target,
            batch_size=batch_size2,
            shuffle=shuffle_,
            num_workers=4
        )

        self.dataset_s = dataset_source
        self.dataset_t = dataset_target
        self.paired_data = PairedData(data_loader_s, data_loader_t, float("inf"))
Example #18
def save_data(train_df, val_df):
    train_f = Features({
        'answers':
        Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        },
                 length=-1,
                 id=None),
        'context':
        Value(dtype='string', id=None),
        'id':
        Value(dtype='string', id=None),
        'question':
        Value(dtype='string', id=None),
        'question_type':
        Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train':
        Dataset.from_pandas(train_df, features=train_f),
        'validation':
        Dataset.from_pandas(val_df, features=train_f)
    })
    file = open("../../data/question_type.pkl", "wb")
    pickle.dump(train_datasets, file)
    file.close()
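The DatasetDict is stored with pickle, so reading it back is a one-liner; a sketch, using the path from above but a hypothetical helper name:

import pickle

def load_question_type_datasets(path="../../data/question_type.pkl"):
    # Returns the DatasetDict with 'train' and 'validation' splits written by save_data
    with open(path, "rb") as f:
        return pickle.load(f)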
Example #19
def test_dataset_with_image_feature_with_none():
    data = {"image": [None]}
    features = Features({"image": Image()})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"image"}
    assert item["image"] is None
    batch = dset[:1]
    assert len(batch) == 1
    assert batch.keys() == {"image"}
    assert isinstance(batch["image"], list) and all(item is None for item in batch["image"])
    column = dset["image"]
    assert len(column) == 1
    assert isinstance(column, list) and all(item is None for item in column)

    # nested tests

    data = {"images": [[None]]}
    features = Features({"images": Sequence(Image())})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"images"}
    assert all(i is None for i in item["images"])

    data = {"nested": [{"image": None}]}
    features = Features({"nested": {"image": Image()}})
    dset = Dataset.from_dict(data, features=features)
    item = dset[0]
    assert item.keys() == {"nested"}
    assert item["nested"].keys() == {"image"}
    assert item["nested"]["image"] is None
Example #20
    def process_data(self,
                     dataset: Dataset,
                     stage: Optional[str] = None) -> Dataset:
        convert_to_features = partial(
            self.convert_to_features,
            tokenizer=self.tokenizer,
            context_name=self.context_name,
            choices=self.choices,
            question_header_name=self.question_header_name,
            answer_column_name=self.answer_column_name,
            options_column_name=self.options_column_name,
            max_length=self.cfg.max_length,
            padding=self.cfg.padding,
        )

        dataset = dataset.map(
            convert_to_features,
            batched=True,
            num_proc=self.cfg.preprocessing_num_workers,
            load_from_cache_file=self.cfg.load_from_cache_file,
        )

        cols_to_keep = [
            x for x in
            ["input_ids", "attention_mask", "token_type_ids", "label", "idx"]
            if x in dataset["train"].features
        ]
        dataset.set_format(columns=cols_to_keep)

        return dataset
Example #21
    def process_data(self, dataset: Dataset, stage: Optional[str] = None) -> Dataset:
        column_names = dataset["train" if stage == "fit" else "validation"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        tokenize_function = partial(self.tokenize_function, tokenizer=self.tokenizer, text_column_name=text_column_name)

        dataset = dataset.map(
            tokenize_function,
            batched=True,
            num_proc=self.cfg.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=self.cfg.load_from_cache_file,
        )

        # Pass in our additional condition term when converting to features
        convert_to_features = partial(
            self.convert_to_features,
            block_size=self.effective_block_size,
            tokenized_condition_term=self.tokenized_condition_term,
        )

        dataset = dataset.map(
            convert_to_features,
            batched=True,
            num_proc=self.cfg.preprocessing_num_workers,
            load_from_cache_file=self.cfg.load_from_cache_file,
        )

        return dataset
Example #22
def _preprocess_dataset(
    dataset_name, data, sentence_col, tokenizer, cache_dir="", short_test=False
):
    preprocess_function = dataset_preprocess.get(dataset_name, lambda x: x)
    data = concate(dataset_name, data, cache_dir)

    data = data.map(lambda x: {"input_text": preprocess_function(x[sentence_col])})
    data["train"] = data["train"].remove_columns(
        set(data["train"].features) - set(["input_text"])
    )

    logging.info(f"NP Concate")
    if dataset_name == "air_dialogue":
        data["train"] = Dataset.from_dict(
            {"input_text": np.concatenate(data["train"]["input_text"]).ravel().tolist()}
        )

    if short_test:
        data["train"] = Dataset.from_dict(
            {"input_text": data["train"]["input_text"][:30]}
        )

    if dataset_name == "air_dialogue" or dataset_name == "yahoo_answers_topics":
        data["train"] = Dataset.from_dict(
            {"input_text": data["train"]["input_text"][:100000]}
        )
    elif dataset_name == "wikipedia" or dataset_name == "yelp_review_full":
        data["train"] = Dataset.from_dict(
            {"input_text": data["train"]["input_text"][:200000]}
        )

    if dataset_name in split_para:
        logging.info(f"Splitting Paragraphs")
        data["train"] = Dataset.from_dict(
            {"input_text": split_long_text(data["train"]["input_text"])}
        )

    logging.info(f"Normalize")
    data = data.map(lambda x: {"input_text": normalize_raw(x["input_text"])})
    logging.info(f"Keep Sentence")
    data = data.filter(lambda x: keep_sentence(x["input_text"]))

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    filename = f"{dataset_name}-full-text.out"
    logging.info(f"Opening file {filename} to write results")
    with open(filename, "w") as outfile:
        outfile.write("FULL TEXT BELOW\n")
        for i, text in enumerate(data["train"]["input_text"]):
            outfile.write(f"{i} | {text}\n")

    logging.info(f"Join")
    data = data.map(lambda x: {"input_text": " ".join(x["input_text"])})
    logging.info(f"Tokenizer")
    data = data.map(
        lambda x: tokenizer(x["input_text"], padding="max_length", truncation=True),
        batched=True,
    )
    return data
Example #23
    def process(self, dataset: datasets.Dataset):
        dataset = dataset.filter(self._filter_cmrc_data)
        if self.task == 'hl_ag':
            dataset = dataset.filter(self._filter_task_hl)
        else:
            dataset = dataset.filter(self._filter_task_qa)
        dataset = dataset.map(self._convert_to_features)

        return dataset
Example #24
def remove_unused_columns(model: nn.Module, dataset: hf_datasets.Dataset) -> None:
    # This mirrors a method implemented in transformers' Trainer.
    # Inspect the model forward signature to keep only the arguments it accepts.
    signature = inspect.signature(model.forward)
    signature_columns = list(signature.parameters.keys())
    # Labels may be named label or label_ids, the default data collator handles that.
    signature_columns += ["label", "label_ids"]
    columns = [k for k in signature_columns if k in dataset.column_names]
    dataset.set_format(type=dataset.format["type"], columns=columns)
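A toy usage of remove_unused_columns with a stand-in model whose forward signature only accepts input_ids, attention_mask and labels; the model and column values are illustrative assumptions:

import inspect  # needed by remove_unused_columns above

import torch.nn as nn
from datasets import Dataset

class ToyModel(nn.Module):
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        return input_ids

ds = Dataset.from_dict({"input_ids": [[1, 2]], "attention_mask": [[1, 1]],
                        "labels": [0], "text": ["hello"]})
remove_unused_columns(ToyModel(), ds)
# Indexing ds now only yields input_ids, attention_mask and labels; "text" is hidden.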
Example #25
 def preprocess(ds: Dataset, **fn_kwargs) -> Dataset:
     ds = ds.map(
         # todo: change this to self.convert_to_features for users to override
         TextClassificationDataModule.convert_to_features,
         batched=True,
         with_indices=True,
         fn_kwargs=fn_kwargs,
     )
     ds.rename_column_("label", "labels")
     return ds
Example #26
    def train(self, training_data, target_station, verbose=1):
        print(self.session_name)
        # print(training_data, target_station, self.vol_size, self.strides)

        dataset = Dataset(self._builder_class, training_data, target_station, self.vol_size, self.strides)
        (X_train, Y_train) = dataset.construct()

        model = self.model(summary=False)
        model.fit(X_train, Y_train, epochs=self.epochs, batch_size=self.batch_size, callbacks=self.callbacks(),
                  validation_split=0.05, verbose=verbose, shuffle=True)
Example #27
def geocode():
    """ Geocode Basol adresses """

    # input dataset
    basol_filtered = Dataset("etl", "basol_filtered")

    # output dataset
    basol_geocoded = Dataset("etl", "basol_geocoded")

    # write output schema
    dtype = basol_filtered.read_dtype(primary_key="numerobasol")

    output_dtype = [
        Column("id", BigInteger(), primary_key=True, autoincrement=True),
        *dtype,
        Column("geocoded_latitude", Float(precision=10)),
        Column("geocoded_longitude", Float(precision=10)),
        Column("geocoded_result_score", Float()),
        Column("geocoded_result_type", String()),
        Column("adresse_id", String())
    ]

    basol_geocoded.write_dtype(output_dtype)

    with basol_geocoded.get_writer() as writer:

        for df in basol_filtered.get_dataframes(chunksize=100):

            df = df.replace({np.nan: None})
            rows = df.to_dict(orient="records")
            payload = [{
                "adresse": row["adresse"],
                "code_insee": row["code_insee"]
            } for row in rows]

            geocoded = bulk_geocode(payload,
                                    columns=["adresse"],
                                    citycode="code_insee")

            zipped = list(zip(rows, geocoded))

            for (row, geocodage) in zipped:
                latitude = geocodage["latitude"]
                row["geocoded_latitude"] = float(latitude) \
                    if latitude else None
                longitude = geocodage["longitude"]
                row["geocoded_longitude"] = float(longitude) \
                    if longitude else None
                result_score = geocodage["result_score"]
                row["geocoded_result_score"] = float(result_score) \
                    if result_score else None
                row["geocoded_result_type"] = geocodage["result_type"]

                if row["geocoded_result_type"] == precisions.HOUSENUMBER and \
                   row["geocoded_result_score"] > 0.6:
                    row["adresse_id"] = geocodage["result_id"]
                else:
                    row["adresse_id"] = None

                writer.write_row_dict(row)
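bulk_geocode is a project helper that is not shown; the loop above only relies on each result being a dict with latitude, longitude, result_score, result_type and result_id keys. A hypothetical stub with that shape, e.g. for dry runs:

def bulk_geocode_stub(payload, columns=None, citycode=None):
    # Same call signature and result shape as the bulk_geocode helper used above,
    # but returns empty results instead of calling a geocoding service.
    return [{"latitude": None,
             "longitude": None,
             "result_score": None,
             "result_type": None,
             "result_id": None} for _ in payload]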
Example #28
def filter_departements():
    """
    Filter the data to keep only the records located
    in the départements selected in the config
    """

    # Input dataset
    sis_source = Dataset("etl", "sis_source")

    # output dataset
    sis_filtered = Dataset("etl", "sis_filtered")

    sis_filtered.write_dtype(sis_source.read_dtype())

    with sis_filtered.get_writer() as writer:
        for row in sis_source.iter_rows():
            code_insee = row["code_insee"]
            keep_row = False
            for departement in DEPARTEMENTS:
                if code_insee.startswith(departement):
                    keep_row = True
                    break
            if keep_row:
                writer.write_row_dict(row)
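DEPARTEMENTS comes from the project configuration and is used only as a collection of code_insee prefixes; a hypothetical value:

# Hypothetical configuration: keep only records whose code_insee starts with these prefixes.
DEPARTEMENTS = ["75", "92", "93", "94"]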
Example #29
def prepare_sites():
    """
    This recipe adds a primary key
    and keeps only selected columns
    """

    # input dataset
    basias_sites_filtered = Dataset("etl", "basias_sites_filtered")

    # output dataset
    basias_sites_prepared = Dataset("etl", "basias_sites_prepared")

    # columns to keep
    keep = ["indice_departemental", "nom_usuel", "raison_sociale"]

    dtype = basias_sites_filtered.read_dtype()

    # transform schema
    output_dtype = [column for column in dtype if column.name in keep]

    id_column = Column("id", BigInteger, primary_key=True, autoincrement=True)

    output_dtype = [id_column, *output_dtype]

    basias_sites_prepared.write_dtype(output_dtype)

    # transform data
    with basias_sites_prepared.get_writer() as writer:
        for row in basias_sites_filtered.iter_rows():
            output_row = dict((key, row[key]) for key in row if key in keep)
            writer.write_row_dict(output_row)
Example #30
def normalize_precision():
    """
    This recipe normalizes the values of the
    lib_precis column to the nomenclature
    PARCEL, HOUSENUMBER, MUNICIPALITY
    """

    # input dataset
    s3ic_geocoded = Dataset("etl", "s3ic_geocoded")

    # output dataset
    s3ic_normalized = Dataset("etl", "s3ic_normalized")

    dtype = s3ic_geocoded.read_dtype()
    s3ic_normalized.write_dtype(dtype)

    with s3ic_normalized.get_writer() as writer:

        for row in s3ic_geocoded.iter_rows():

            mapping = {
                "Coordonnées précises": precisions.PARCEL,
                "Coordonnée précise": precisions.PARCEL,
                "Valeur Initiale": precisions.PARCEL,
                "Adresse postale": precisions.HOUSENUMBER,
                "Centroïde Commune": precisions.MUNICIPALITY,
                "Inconnu": precisions.MUNICIPALITY
            }
            precision = row.get("precision")
            if precision:
                row["precision"] = mapping.get(precision)
            else:
                row["precision"] = precisions.MUNICIPALITY

            writer.write_row_dict(row)
Example #31
def merge_cadastre():
    """ Merge the different parcelles into a MultiPolygon """

    # Input dataset
    basol_cadastre_joined = Dataset("etl", "basol_cadastre_joined")

    # Output dataset
    basol_cadastre_merged = Dataset("etl", "basol_cadastre_merged")

    dtype = [
        Column("id", BigInteger, primary_key=True, autoincrement=True),
        Column("numerobasol", String),
        Column("geog", Geometry(srid=4326))
    ]
    basol_cadastre_merged.write_dtype(dtype)

    BasolCadastreJoined = basol_cadastre_joined.reflect()

    session = basol_cadastre_joined.get_session()

    select = [
        BasolCadastreJoined.numerobasol,
        func.st_multi(func.st_union(BasolCadastreJoined.geog))
    ]

    q = session.query(*select) \
               .group_by(BasolCadastreJoined.numerobasol) \
               .all()

    with basol_cadastre_merged.get_writer() as writer:
        for (numerobasol, geog) in q:
            row = {"numerobasol": numerobasol, "geog": geog}
            writer.write_row_dict(row)

    session.close()
Example #32
def prepare_solution():
    train = Dataset.from_train()
    X = train.get_features()
    Y = train.get_labels()
    rf = RandomForestRegressor(n_jobs=-1)
    model = rf.fit(X, Y)
    print('Train score: %f' % loss(Y, model.predict(X)))
    test = Dataset.from_test()
    X2 = test.get_features()
    Y2 = model.predict(X2)
    save_predictions(Y2, test)
Example #33
 def predict(self, seq):
     result = {}
     self.predict_data_ = seq
     for label, value in self.gmmhmms.items():
         gmmhmm = value['gmmhmm']
         status_set = value['status_set']
         d = Dataset(motion_type=status_set['motion'], sound_type=status_set['sound'],
                     location_type=status_set['location'])
         seq_converted = np.array(d._convetNumericalSequence(seq))
         result[label] = gmmhmm.score(seq_converted)
     return result
Example #34
def classifyByGMMHMM(seq, models, configs):

    Y = []
    for config in configs:
        _rawdata_type = config["logType"]
        _event_type = config["eventType"]
        _motion_type = config["motionType"]
        _sound_type = config["soundType"]
        _location_type = config["locationType"]

        d = Dataset(
            rawdata_type=_rawdata_type,
            event_type=_event_type,
            motion_type=_motion_type,
            sound_type=_sound_type,
            location_type=_location_type
        )
        # Convert the sequence that needs prediction into numerical form.
        y = np.array(d._convetNumericalSequence(seq))
        Y.append(y)


    _GMMHMMs = []
    for model in models:
        _GMMs = []
        for gmm in model["gmmParams"]["params"]:
            _GMM = GMM(
                n_components=model["nMix"],
                covariance_type=model["covarianceType"]
            )
            _GMM.covars_  = np.array(gmm["covars"])
            _GMM.means_   = np.array(gmm["means"])
            _GMM.weights_ = np.array(gmm["weights"])
            _GMMs.append(_GMM)
        _GMMHMM = GMMHMM(
            n_components=model["nComponent"],
            n_mix=model["nMix"],
            startprob=np.array(model["hmmParams"]["startProb"]),
            transmat=np.array(model["hmmParams"]["transMat"]),
            gmms=_GMMs,
            covariance_type=model["covarianceType"]
        )
        _GMMHMMs.append(_GMMHMM)

    results = []
    # for _GMMHMM in _GMMHMMs:
        # res = _GMMHMM.score(Y)
        # results.append(res)
    for i in range(0, len(models)):
        res = _GMMHMMs[i].score(Y[i])
        results.append(res)

    return results
Example #35
def submission():
    print('Cross validate K-Means model')
    train = Dataset.from_train()
    test = Dataset.from_test()
    X = train.get_features()
    Y = train.get_labels()
    X2 = test.get_features()
    kmeans = KMeans(n_clusters=8)
    clf = kmeans.fit(X, train.get_multi_labels())
    score = check_score(Y, to_labels(clf.predict(X)))
    print("Train dataset score %f" % (score/len(X)))
    Y2 = to_labels(clf.predict(X2))
    save_predictions(Y2, test.df)
Example #36
def submission():
    print('Cross validate bayes model')
    train = Dataset.from_train()
    test = Dataset.from_test()
    X = train.get_features()
    Y = train.get_labels()
    X2 = test.get_features()
    gnb = bayes.MultinomialNB()
    clf = gnb.fit(X, train.get_multi_labels())
    score = check_score(Y, to_labels(clf.predict(X)))
    print("Train dataset score %f" % (score/len(X)))
    Y2 = to_labels(clf.predict(X2))
    save_predictions(Y2, test.df)
Example #37
def main():
    print('Explore dataset')
    train = Dataset.from_train()
    u = train.pca()
    print('U shape: ' + str(u.shape))
    X = train.get_pca_features(u)
    print(X.shape)
Example #38
def make_submission(network, params, u):
    print('Prepare submission')
    test = Dataset.from_test()
    if params.pca:
        X2 = test.get_pca_features(u)
    else:
        X2 = test.get_features()
    predictions = network.predict(X2)
    save_predictions(predictions, test.df)
Example #39
def train_nn(restore):
    print('Training neural net')
    encoder = AutoEncoder()
    encoder.restore_session()
    train_data = Dataset.from_train()
    X = encoder.encode(train_data.get_features())
    y = train_data.get_labels()
    nn = NeuralNet()
    nn.fit(X, y)
Example #40
def prepare_submission(params):
    print('Prepare submission with params')
    print(params)
    network = NeuralNetwork(params)
    train = Dataset.from_train()
    u = train.pca()
    if params.pca:
        X = train.get_pca_features(u)
    else:
        X = train.get_features()
    Y = train.get_labels()
    network.fit(X, Y)
    score = network.check_score(X, Y)
    print('Train dataset score %f' % (score/len(X)))
    make_submission(network, params, u)
Example #41
trans_mat_prior = np.array([[0.2, 0.1, 0.3, 0.4],
                            [0.3, 0.2, 0.2, 0.3],
                            [0.1, 0.1, 0.1, 0.7],
                            [0.1, 0.3, 0.4, 0.2]])

# Build an HMM instance and set parameters
model_dining  = GMMHMM(startprob_prior=start_prob_prior, transmat_prior=trans_mat_prior, startprob=start_prob, transmat=trans_mat, n_components=4, n_mix=4, covariance_type='spherical', n_iter=50)
model_fitness = GMMHMM(startprob_prior=start_prob_prior, transmat_prior=trans_mat_prior, startprob=start_prob, transmat=trans_mat, n_components=4, n_mix=10, covariance_type='spherical', n_iter=50)
model_work    = GMMHMM(startprob_prior=start_prob_prior, transmat_prior=trans_mat_prior, startprob=start_prob, transmat=trans_mat, n_components=4, n_mix=8, covariance_type='spherical', n_iter=50)
model_shop    = GMMHMM(startprob_prior=start_prob_prior, transmat_prior=trans_mat_prior, startprob=start_prob, transmat=trans_mat, n_components=4, n_mix=4, covariance_type='spherical', n_iter=50)

# print model_dining.gmms_[0].covars_.tolist()
# print model_dining.gmms_[0].means_.tolist()
# print model_dining.gmms_[0].weights_.tolist()

dataset_dining  = Dataset()
dataset_fitness = Dataset()
dataset_work    = Dataset()
dataset_shop    = Dataset()

# print Dataset().randomObservations('dining_out_in_chinese_restaurant', 10, 10).obs

D = dataset_dining.randomObservations('dining.chineseRestaurant', 10, 300).getDataset()
F = dataset_fitness.randomObservations('fitness.running', 10, 300).getDataset()
W = dataset_work.randomObservations('work.office', 10, 300).getDataset()
S = dataset_shop.randomObservations('shopping.mall', 10, 300).getDataset()
# dataset_dining.plotObservations3D()

# D = Dataset(obs_dining).dataset
# F = Dataset(obs_fitness).dataset
# W = Dataset(obs_work).dataset
Example #42
                'transMat': self.gmmhmm.transmat_.tolist(),
                'transMatPrior': self.gmmhmm.transmat_prior.tolist(),
                'startProb': self.gmmhmm.startprob_.tolist(),
                'startProbPrior': self.gmmhmm.startprob_prior.tolist(),
            },
            'gmmParams': {
                'nMix': self.gmmhmm.n_mix,
                'covarianceType': self.gmmhmm.covariance_type,
                'gmms': gmms_,
            }
        }


if __name__ == '__main__':
    from datasets import Dataset
    d = Dataset()
    d.randomObservations("dining#chineseRestaurant", 10, 10)

    _model = {
        "hmmParams": {
            "transMat": [
                [
                    0.2,
                    0.1,
                    0.3,
                    0.4
                ],
                [
                    0.3,
                    0.2,
                    0.2,
Example #43
def train_auto_encoder(restore):
    print('Training auto encoder')
    network = AutoEncoder()
    train_data = Dataset.from_train()
    test_data = Dataset.from_test()
    network.fit_encoder(train_data, test_data, restore=restore)