Example #1
def remove_low_counts(data, method="sum", threshold=50, target_identifier=""):
    # configure the cleaner to drop genes whose summed count falls below the threshold
    params = cc.CleanParameters("umc")
    params.min_sum_count = threshold
    report = cc.clean_gene_count_file(data, params)

    # register the cleaned frame as a new dataset in the repository
    dataset = dr.create_data_set(report.dataset_identifier, "genes low counts removed", report.dataset_location)
    print(dataset.added_on)
    return dataset
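The filtering itself happens inside cc.clean_gene_count_file, which is application-internal. For readers without that module, here is a minimal standalone sketch of the same sum-count filter in pandas; the name filter_low_counts and the column layout (gene identifier in column 0, sample counts in the rest) are assumptions for illustration, not part of the original listing:

import pandas as pd

def filter_low_counts(frame, threshold=50):
    # keep genes whose counts, summed across all sample columns, reach the threshold
    totals = frame.iloc[:, 1:].sum(axis=1)
    return frame[totals >= threshold]

counts = pd.DataFrame({"gene": ["g1", "g2"], "s1": [100, 3], "s2": [40, 2]})
print(filter_low_counts(counts))  # keeps g1 (sum 140), drops g2 (sum 5)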
Example #2
def remove_non_expressed_genes(data, source_identifier, custom_prefix="no_outliers_"):
    # keep only genes with a non-zero total count across the sample columns
    # (column 0 holds the gene identifier, the remaining columns are counts)
    new_set = data[~(data.iloc[:, 1:].sum(axis=1) == 0)]

    target_name = custom_prefix + source_identifier
    location = APP_CONFIG["application_base_location"] + target_name + ".txt"

    # register the new dataset, then persist the filtered frame to its location
    created = dr.create_data_set(identifier=target_name, type=dr.ACTION_TYPES["zcounts"], location=location)

    new_set.to_csv(location, index=False, sep="\t")

    return created
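The boolean mask above can be verified in isolation. A self-contained sketch, assuming the same layout (gene identifier in column 0, sample counts in the remaining columns):

import pandas as pd

frame = pd.DataFrame({"gene": ["g1", "g2"], "s1": [0, 7], "s2": [0, 1]})
# ~ inverts the "all counts sum to zero" mask, keeping expressed genes only
expressed = frame[~(frame.iloc[:, 1:].sum(axis=1) == 0)]
print(expressed)  # g1 is dropped: its counts sum to zero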
Example #3
def normalize_gene_counts(data, method, threshold=None, target_identifier="", experiment_identifier=""):
    # TODO: if a normalized set already exists, fetch it from the db instead
    processing_result = cn.bioconductor_normalization(data, method)

    # persist the normalized frame under a freshly generated internal identifier
    new_intern_identifier = identifier_generator.get_generated_guid_as_string()
    hdf_storage.store_hdf(processing_result.frame, new_intern_identifier)
    print("compressed pytable storage done")

    # resolve the public identifier of the package that produced the result
    package = mr.get_package_by_name_and_version(processing_result.package,
                                                 processing_result.version).public_identifier
    dataset = dr.create_data_set(new_intern_identifier, package + "_" + target_identifier, "genes normalized dataset",
                                 package_identifier=package,
                                 experiment_identifier=experiment_identifier)
    print("new dataset saved")
    # linking the dataset to the experiment is currently disabled:
    # er.link_dataset_to_experiment(experiment_identifier, dataset.public_identifier)
    return dataset
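The actual normalization is delegated to cn.bioconductor_normalization, whose implementation is not shown here. As an illustrative stand-in only (not the method the original calls), a simple counts-per-million scaling in pandas conveys the shape of the operation:

import pandas as pd

def cpm_normalize(frame):
    # scale every sample column to counts-per-million; column 0 holds gene ids
    counts = frame.iloc[:, 1:]
    scaled = counts.div(counts.sum(axis=0), axis=1) * 1_000_000
    return pd.concat([frame.iloc[:, [0]], scaled], axis=1)

frame = pd.DataFrame({"gene": ["g1", "g2"], "s1": [100, 300], "s2": [10, 30]})
print(cpm_normalize(frame))  # each sample column now sums to 1e6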
Example #4
    def post(self):
        args = dataset_parser.parse_args()
        print("dataset post received")
        # upload new dataset
        if args.dataset_identifier is None:
            print("creating new data set")
            new_file = args.file

            filename = secure_filename(new_file.filename)
            print("uploaded" + filename)
            intern_identifier = ig.get_generated_guid_as_string()
            new_file.save(filename)

            try:
                with open(filename, "rb") as fl:
                    frame = dr.get_data_frame_from_csv(fl)

                    intern_location = dr.store_data_frame_to_hdf(frame, intern_identifier)
                    # raw counts by default
                    data_entity = dr.create_data_set(intern_identifier,
                                                     public_identifier=filename,
                                                     dataset_type="raw gene counts",
                                                     experiment_identifier="raw_data_container")  # For demo -> add exp identifier
                    server_hash = ig.md5_for_file(fl)
                print(filename + " is saved")

                print("file removed")
                return cr.JsonResource(
                    {"filename": filename, "intern_identifier": data_entity.intern_identifier,
                     "public_identifier": data_entity.public_identifier, "server_md5": server_hash}), 201
            except IntegrityError:
                return cr.StringApiResource("Public identifier already taken"), 409
            except Exception:
                return cr.StringApiResource("An error has occurred, check that your data set complies with the expected format"), 400
            finally:
                # remove the temporary upload from disk regardless of outcome
                if filename is not None:
                    os.remove(filename)
                    print("file removed")
        else:
            # a new dataset is created based on source
            try:
                print("pre-processing existing dataset")
                source_data = args.dataset_identifier
                print("source data : " + source_data)
                # target_data = args.target_dataset_identifier
                # TODO link to experiment identifier!
                # experiment_identifier = args.experiment_identifier
                method_identifier = args.preprocessing_method_identifier

                print("source " + source_data + "method " + method_identifier)

                path = APP_CONFIG["application_files_location"] + APP_CONFIG["application_store_name"]
                df = dr.get_data_frame_from_hdf(source_data, path)
                print("data frame is loaded")
                new_data_set = ed.normalize_gene_counts(df, method_identifier,
                                                        target_identifier=source_data,
                                                        experiment_identifier=args.experiment_identifier)

                return cr.JsonResource(eto.SummaryDatasetView(new_data_set).to_json()), 201
            except Exception as e:
                print(str(e))
                return cr.StringApiResource("Explosion! Tool down..."), 400