Example #1
class GenerateSilicoData:
    def __init__(self, config_filename):
        self.logger = Utils.get_logger('generate_and_store_silico')
        config_raw = Utils.get_config(config_filename, "SilicoData")
        self.config = GenerateSilicoConfig(**config_raw)
        self.store = Storage(config_filename)
        self.generator_manager = SilicoGeneratorsManager(config_filename)
        self.generator_manager.setup()

    def run(self):
        self.logger.info("generating silico data")
        num_genes = self.config.num_genes
        num_experiments = self.config.num_experiments
        num_pfs = self.config.num_pfs
        self.logger.info(f"    --number of genes:{num_genes}")
        self.logger.info(f"    --number of experiments:{num_experiments}")
        self.logger.info(f"    --number of pfs:{num_pfs}")

        self.logger.info("generating validation data")
        validation = self.generator_manager.generate_validation_data(
            num_genes=num_genes, num_pfs=num_pfs)
        self.logger.info("storing validation data")
        self.store.insert_validation(validation)
        self.logger.info("generating all fake experiments")
        providers = self.generator_manager.generate_all_experiments(
            validation, num_experiments, num_genes)
        self.logger.info("storing experiments data")
        for provider_idx, provider in enumerate(providers):
            self.logger.info(
                f"silico provider: {provider_idx+1}/{len(providers)}")
            for experiment_idx, experiment in enumerate(provider):
                self.logger.info(
                    f"silico experiment {experiment_idx+1}/{len(provider)}")
                self.store.insert_geo(experiment)
        self.logger.info("done")
Example #2
class GEOImporter:
    def __init__(self, config_filename):
        GEOparse.set_verbosity("ERROR")
        self.config_filename = config_filename
        data_section = Utils.get_config(config_filename, 'GEOImporter')
        self.config = GEOImporterConfig(**data_section)
        self.logger = Utils.get_logger('GEOImporter')
        self.storage = Storage(config_filename)
        self.labels = self.config.labeling
        self.inputs = self.config.input_data
        self.control_labels = self.labels.control
        self.type_labels = self.labels.type
        self.gene_names = self.labels.gene_names
        self.path = self.config.data_path
        self.experiment_columns = {}

    def __get_genes(self, gse):
        gene_label = Utils.find_in_array(self.gene_names, gse.table.columns)
        if gene_label == 'unknown':
            return 'unknown'
        genes_read = gse.table[gene_label].tolist()
        return Utils.deduplicate_genes(genes_read)

    def __split_control_perturbed(self, gse, column_entry, type_idx):
        control_series = []
        perturbed_series = []
        for series_name, description in gse.columns.iterrows():
            if Utils.is_control(description[type_idx], self.control_labels):
                column_entry['control'] = description[type_idx]
                control_series.append(series_name)
            else:
                column_entry['perturbed'] = description[type_idx]
                perturbed_series.append(series_name)

        return [control_series, perturbed_series]

    def __fix_bad_data(self, data: GeoData):
        max_replicates = 6
        # limit the data to the most informative replicates because we
        # don't have enough memory to run the R methods on the full data
        if len(data.control_array) < len(data.genes):
            # some experiments need to be transposed
            control = np.array(data.control_array).T.tolist()
            perturbed = np.array(data.perturbed_array).T.tolist()
        else:
            control = data.control_array
            perturbed = data.perturbed_array
        # for control keep the first replicates
        control = [x[:max_replicates] for x in control]
        # for perturbed keep the last replicates
        perturbed = [x[-max_replicates:] for x in perturbed]
        # taking first/last replicates favors time-series experiments
        control = Utils.log_if_necessary(np.array(control))
        perturbed = Utils.log_if_necessary(np.array(perturbed))

        control = Utils.quantile_normalize(pd.DataFrame(control))
        perturbed = Utils.quantile_normalize(pd.DataFrame(perturbed))

        data.control_array = control.to_numpy().tolist()
        data.perturbed_array = perturbed.to_numpy().tolist()
        return data
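
For reference, the quantile normalization applied above is usually the rank-based variant: build a reference distribution from the mean of each rank across columns, then map every value back through its rank. A minimal pandas sketch of what Utils.quantile_normalize plausibly does (an assumption, not the project's actual implementation):

    import pandas as pd

    def quantile_normalize(df: pd.DataFrame) -> pd.DataFrame:
        # mean of each rank across columns = reference distribution
        rank_mean = df.stack().groupby(
            df.rank(method='first').stack().astype(int)).mean()
        # replace every value by the reference value of its rank
        return df.rank(method='min').stack().astype(int) \
                 .map(rank_mean).unstack()
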

    def __do_data_item(self, gse, geo_id, source_name, pf):
        if not hasattr(gse, 'columns'):
            geo_data = self.__do_no_columns_item(gse, geo_id, source_name, pf)
        else:
            geo_data = self.__do_columns_item(gse, geo_id, source_name, pf)

        return geo_data

    def __do_no_columns_item(self, gse, geo_id, source_name, pf):
        control = self.labels.no_column_control
        phenotype_data = gse.phenotype_data
        columns = phenotype_data.columns
        info_experiment_idx = columns.get_loc(self.labels.no_column_title)
        gsm_ids_idx = columns.get_loc(self.labels.no_column_accession)
        gsm_type = list(phenotype_data.values[:, info_experiment_idx])
        gsm_ids = list(phenotype_data.values[:, gsm_ids_idx])
        control_gsms = []
        perturbation_gsms = []
        raw_control_data = []
        raw_perturbed_data = []
        for idx in range(0, len(gsm_type)):
            gsm_id = gsm_ids[idx]
            table = gse.gsms[gsm_id].table
            value_idx = table.columns.get_loc('VALUE')
            values = gse.gsms[gsm_id].table.values[:, value_idx].tolist()
            if Utils.find_in_array(gsm_type[idx], control) != 'unknown':
                control_gsms.append(gsm_id)
                raw_control_data.append(values)
            else:
                perturbation_gsms.append(gsm_id)
                raw_perturbed_data.append(values)

        if not control_gsms:
            self.logger.error(f'[no col] no control for {geo_id}')
            return None

        genes = gse.gsms[control_gsms[0]].table.values[:, 0]
        np_control_raw = np.array(raw_control_data)
        np_perturbed_raw = np.array(raw_perturbed_data)

        control = Utils.repair_nan_fast(np_control_raw)
        perturbed = Utils.repair_nan_fast(np_perturbed_raw)

        self.logger.info(f'finished {geo_id}')
        geo_data = GeoData({
            "name": geo_id,
            "genes": genes.tolist(),
            "source": source_name,
            "perturbed_series_names": perturbation_gsms,
            "control_series_names": control_gsms,
            "extra_info": gse.metadata,
            "perturbed_array": perturbed.tolist(),
            "control_array": control.tolist(),
            "pf": pf
        })
        return geo_data

    def __do_columns_item(self, gse, geo_id, source_name, pf):
        iter_labels = list(self.type_labels)
        type_labels = self.type_labels
        column_entry = {}
        # copy: iter_labels is consumed by pop() in the loop below
        column_entry['all'] = list(iter_labels)

        while iter_labels:
            type_label = Utils.find_in_array(gse.columns, iter_labels)
            iter_labels.pop(0)

            if type_label == 'unknown':
                error_msg = f'no label geoid {geo_id} labels:{type_labels}'
                self.logger.error(error_msg)
                continue

            type_idx = gse.columns.columns.get_loc(type_label)
            genes = self.__get_genes(gse)

            if genes == 'unknown':
                self.logger.error(
                    f'no gene label for geoid {geo_id} labels:{type_labels}')
                continue

            control_series, perturbed_series = self.__split_control_perturbed(
                gse, column_entry, type_idx)

            if not control_series:
                continue

            if not perturbed_series:
                continue

            np_control_raw = gse.table[control_series].to_numpy()
            np_perturbed_raw = gse.table[perturbed_series].to_numpy()

            control = Utils.repair_nan_fast(np_control_raw)
            perturbed = Utils.repair_nan_fast(np_perturbed_raw)

            self.logger.info(f'finished {geo_id}')
            geo_data = GeoData({
                "name": geo_id,
                "genes": gene_label,
                "source": source_name,
                "perturbed_series_names": perturbed_series,
                "control_series_names": control_series,
                "extra_info": gse.metadata,
                "perturbed_array": perturbed.tolist(),
                "control_array": control.tolist(),
                "pf": pf
            })

            self.experiment_columns[geo_id] = column_entry

            return geo_data

        error_msg = (f"could not split {geo_id} in 2 classes"
                     f"high cols: {gse.columns}")
        self.logger.error(error_msg)
        return None

    def __download_retry(self, geo_id, cache_path):
        gse = None
        retry_count = 0
        while True:
            self.logger.info(f"Downloading {geo_id}")
            self.logger.info(f"--Retry count {retry_count}")
            try:
                gse = GEOparse.get_GEO(geo=geo_id, destdir=cache_path)
            except IOError as err:
                self.logger.warning(f"Error downloading geo data {err}")
                retry_count += 1
                if retry_count > self.config.download_retry_count:
                    gse = None
                    self.logger.error(f"Could not download {geo_id}")
                    break
                self.logger.warning(
                    f"Waiting for {self.config.retry_wait} seconds")
                time.sleep(self.config.retry_wait)
                continue
            break
        return gse
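
A property worth noting about this retry loop: GEOparse.get_GEO reuses a SOFT file that is already present in destdir rather than fetching it again, so re-running the importer after a partial failure only downloads the experiments that are still missing.
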

    def __do_input(self, input_source):
        cache_folder = self.config.cache_folder
        cache_path = os.path.join(self.path, cache_folder, input_source.name)
        created_folder = Utils.create_folder_if_not_exist(cache_path)
        if created_folder:
            self.logger.info(f"created directory {cache_path}")
        for data_item in input_source.data:
            geo_id = data_item['geoid']
            pf = data_item[input_source.pf_field].lower()
            info_msg = f'Getting GEO: {geo_id} in cache folder {cache_path}'
            self.logger.info(info_msg)
            gse = self.__download_retry(geo_id, cache_path)
            if gse is None:
                sys.exit(f"Failed to download data for {geo_id}")
            geo_data = self.__do_data_item(gse, geo_id, input_source.name, pf)
            if geo_data:
                geo_data = self.__fix_bad_data(geo_data)
                self.storage.insert_geo(geo_data)
        self.logger.info('Writing columns to json file')
        log_str = json.dumps(self.experiment_columns, sort_keys=True, indent=4)
        self.logger.info(f"{log_str}")
        self.logger.info(f'Finished importing GEO data for {input_source.name}')

    def importGEOData(self):
        labels = self.config.labeling
        inputs = self.config.input_data
        control_labels = labels.control
        type_labels = labels.type
        gene_names = labels.gene_names

        log_message = (f"reading geos "
                       f"--control labels: {control_labels} "
                       f"--type labels: {type_labels} "
                       f"--gene names: {gene_names}")
        self.logger.info(log_message)
        for input_source in inputs:
            self.logger.info(f"Loading data from file {input_source.file}")
            input_source.load()
            self.__do_input(input_source)
Example #3
class BenchmarkDiffMethods:
    def __init__(self, config_filename):
        self.logger = Utils.get_logger("Benchmark")
        self.config = Utils.get_config(config_filename, "Benchmark")
        self.config = BenchamarkDiffMethodsConfig(logger=self.logger,
                                                  **self.config)
        self.method_manager = DiffMethodsManager(config_filename)
        self.method_manager.setup()
        self.storage = Storage(config_filename)
        self.metric_manager = MetricManager(config_filename)
        self.metric_manager.setup()

    def run_method(self, method_name: str, gene_input: GeneDiffInput):
        # unused stub; method execution goes through self.method_manager.run
        pass

    def get_execution_map(self):
        # collect all methods so we don't run them twice on the same inputs
        execution_map = {}

        for run in self.config.runs:
            for _, data in self.config.method_groups.items():
                for method_name in data.methods:
                    entry = execution_map.setdefault(method_name, set())
                    entry.update(run.data_sources)
        return execution_map
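
For a concrete picture, the map produced above pairs each method with the union of every run's data sources, for example (method and source names are purely illustrative):

    execution_map = {
        "deseq2": {"geo_batch_1", "silico_small"},  # hypothetical names
        "ttest": {"geo_batch_1"},
    }
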

    def generate_method_results(self):
        execution_map = self.get_execution_map()
        data_source_cache = {}
        for method_name, execution_data in execution_map.items():
            for source in execution_data:
                self.logger.info(f"get data from: {source}")
                if source in data_source_cache:
                    experiments = data_source_cache[source]
                else:
                    geos = self.storage.get_geo({'source': source})
                    experiments = []
                    for exp in geos:
                        gen_data = GeneData()
                        gen_data.gene_input = GeneDiffInput.from_geo_data(exp)
                        gen_data.name = exp.name
                        experiments.append(gen_data)
                    data_source_cache[source] = experiments
                num_experiments = len(experiments)

                for idx, gene_data in enumerate(experiments):
                    if self.storage.has_method_results(method_name,
                                                       gene_data.name):
                        exp_name = gene_data.name
                        self.logger.info(f"already computed [{exp_name}]")
                        continue
                    self.logger.info(f"running {id+1}/{num_experiments}")
                    self.logger.info(f"{method_name}[{gene_data.name}]")

                    res = self.method_manager.run(gene_data.gene_input,
                                                  method_name)
                    self.storage.insert_method_results(res, method_name,
                                                       gene_data.name)

    def generate_comparison_single(self, method_name, geodata, run, cache):
        query = {'method_name': method_name,
                 'experiment_name': geodata.name}
        res = self.storage.get_method_results(query)[0]
        for validation_set in run.validation_sets:
            self.logger.info(f"Getting validation data for {validation_set}")
            if validation_set not in cache:
                valid = self.storage.get_validation_data(
                    validation_set, geodata.pf)
                cache[validation_set] = valid

            valid = cache[validation_set]

            self.logger.info("Adding data to metrics")
            self.metric_manager.add(geodata.pf, method_name, valid, res)

    def generate_comparisons(self):
        validation_cache = {}
        self.logger.info("collecting metrics")
        for run in self.config.runs:
            all_geos = []
            for data_source in run.data_sources:
                all_geos.extend(self.storage.get_geo({"source": data_source}))
            for key, data in self.config.method_groups.items():
                for method_name in data.methods:
                    for geodata in all_geos:
                        self.generate_comparison_single(
                            method_name, geodata, run, validation_cache)
                self.logger.info(f"evaluating metrics for {key}")
                self.metric_manager.evaluate(key)
        self.logger.info("finished generating metrics")
Example #4
def generate_all_feature_vectors(out_file_name):
    storage = Storage("config.json")
    logger = Utils.get_logger("generate_feature_vectors")
    logger.info("getting all validation sources")
    valid_sources = list(storage.get_validation_sources())
    logger.info(f"we have {len(valid_sources)} validation sources")
    logger.info("filtering validation sources")

    valid_sources = list(filter(lambda x: x != 'all', valid_sources))
    logger.info(f"we now have {len(valid_sources)} validation sources")
    validation_cache = {}
    logger.info("getting all geos")
    geos = storage.get_geo({})
    logger.info(f"we have total {len(geos)}")
    logger.info("filtering geos")
    #geos = list(filter(lambda x: 'silico' not in x.source, geos))
    logger.info(f"after filtering geos we have {len(geos)}")
    logger.info("populating validation cache")
    for idx, valid_source in enumerate(valid_sources):
        logger.info(f"adding to cache {idx}/{len(valid_sources)}")
        # collect all pfs
        pfs = set()
        for geo in geos:
            pf = geo.pf.lower()
            pfs.add(pf)

        for pf in pfs:
            valid = storage.get_validation_data(valid_source, pf)
            valid_data = valid.data
            if pf in valid_data:
                if pf not in validation_cache:
                    validation_cache[pf] = set()
                genes = valid_data[pf]
                genes = [x.lower() for x in genes]
                validation_cache[pf].update(genes)

    logger.info("generating training data from geos")
    all_data = []
    midget_instance = MIDGETNeural()
    running_sum = 0
    for idx, geo in enumerate(geos):
        pf = geo.pf.lower()
        if pf not in validation_cache:
            logger.error(f"could not find {pf} in validation set, skipping")
            continue
        logger.info(f"parsing geo {idx}/{len(geos)}")
        control = np.array(geo.control_array)
        perturbed = np.array(geo.perturbed_array)
        valid_genes = validation_cache[pf]
        geo_genes = []
        for gene in geo.genes:
            if isinstance(gene, str):
                geo_genes.append(gene.lower())
            else:
                geo_genes.append("__")
        y = [int(gene in valid_genes) for gene in geo_genes]
        y = np.array(y)
        running_sum += np.sum(y)
        num_genes = len(geo.genes)
        for row_idx in range(0, num_genes):
            feature_vector = midget_instance.get_feature_vector(
                control[row_idx], perturbed[row_idx])
            fy = y[row_idx]
            all_data.append({"x": feature_vector, "y": fy})
        logger.info(f"current positives {running_sum}")
    logger.info(f"writing file with {running_sum} positives")
    with open(out_file_name, 'wb') as f:
        pickle.dump(all_data, f)
    logger.info("done")
Example #5
class ValidationDataImporter:

    def __init__(self, config):
        self.collect = {}
        self.storage = Storage(config)
        validation_section = Utils.get_config(config, 'ValidationDataImporter')
        self.config = ValidationDataImporterConfig(**validation_section)
        self.logger = Utils.get_logger('ValidationDataImporter')

    def importValidationData(self):
        self.logger.info("importing validation data")
        sources = self.config.sources

        self.logger.info("importing drug gene interaction")
        drug_gene_interaction = sources['DRUG GENE INTERACTION']
        drug_file = drug_gene_interaction['data_file']
        dgidb_data = self.drug_gene_interaction_db(self.config.base_path,
                                                   drug_file)

        self.logger.info("importing encode")
        encode_source = sources['ENCODE']
        encode_data = self.__from_attribute_matrix('encode',
                                                   encode_source['dict_file'],
                                                   encode_source['data_file'],
                                                   self.config.base_path)

        self.logger.info("importing chea")
        chea_source = sources['CHEA']
        chea_data = self.__from_attribute_matrix('chea',
                                                 chea_source['dict_file'],
                                                 chea_source['data_file'],
                                                 self.config.base_path)

        self.logger.info("importing pp interaction")
        ppi_int = sources['PP INTERACTION']
        ppi_int_data = self.ppi_interaction(self.config.base_path,
                                            ppi_int['data_file'])

        self.logger.info("importing ppi study")
        ppi_tfs = sources['PPI STUDY']
        ppi_tfs_data = self.ppi_tf(self.config.base_path,
                                   ppi_tfs['data_file'])

        store_to_db = {
            'tf_encode': encode_data,
            'tf_chea':  chea_data,
            'pp_interaction': ppi_int_data,
            'ppis_study': ppi_tfs_data
        }
        store_to_db['all'] = self.merge_all(store_to_db)
        store_to_db['drug_gene_interaction'] = dgidb_data
        self.logger.info('storing to validation data..')
        for key, data in store_to_db.items():
            data = GeneDiffValidation({'source': key, 'data': data})
            self.storage.insert_validation(data)
        self.logger.info('done..')

    def __check_add_collection(self, collection, p1, p2):
        # record the undirected pair (p1, p2); return True if it was new
        def check_add_one(p1, p2, collection):
            if p1 in collection:
                if p2 not in collection[p1]:
                    collection[p1].add(p2)
                    return True
            return False

        if p1 not in collection and p2 not in collection:
            collection[p1] = set([p2])
            return True

        if check_add_one(p1, p2, collection):
            return True

        if check_add_one(p2, p1, collection):
            return True

        return False

    def __load_dict_attributes(self, dict_path):
        attributes = {}
        with open(dict_path) as tsvfile:
            reader = self.__get_tsv_reader(tsvfile)
            for row in reader:
                attributes[row['GeneID']] = row['GeneSym']
        return attributes

    def __get_tsv_reader(self, tsvfile):
        return csv.DictReader(
            filter(lambda row: row[0] != '#', tsvfile), dialect='excel-tab')

    def __from_attribute_matrix(self, name, dict_file_path,
                                data_file_path, base_path):

        dict_path = os.path.join(base_path, dict_file_path)
        data_path = os.path.join(base_path, data_file_path)
        collect = {}
        new_links = 0
        new_tfs = 0
        attributes = {}
        self.logger.info(f"Doing db {name}")
        attributes = self.__load_dict_attributes(dict_path)

        with open(data_path) as tsvfile:
            reader = self.__get_tsv_reader(tsvfile)
            for row in reader:
                popGene = row['GeneSym'].lower()
                for key, value in row.items():
                    if key not in attributes:
                        continue
                    value_float = float(value)
                    if Utils.isclose(value_float, 0.0):
                        continue
                    tf_name = attributes[key].lower()
                    if tf_name in collect:
                        if popGene not in collect[tf_name]:
                            collect[tf_name].add(popGene)
                            new_links = new_links + 1
                    else:
                        collect[tf_name] = set([popGene])
                        new_tfs = new_tfs + 1
        message = (f"TF db:{name }",
                   f"new tfs: {new_tfs}",
                   f"new links: {new_links}")
        self.logger.info(message)
        return collect

    def ppi_interaction(self, base_path, data_file) -> dict:
        collect = {}
        new_entries = 0
        pina_filename = os.path.join(base_path, data_file)
        with open(pina_filename) as tsvfile:
            reader = self.__get_tsv_reader(tsvfile)
            for row in reader:
                try:
                    proteinALong = row['Alt. ID(s) interactor A']
                    proteinBLong = row['Alt. ID(s) interactor B']
                    proteinAShort = re.search(r'uniprotkb:(.*)\(gene name\)',
                                              proteinALong).group(1).lower()
                    proteinBShort = re.search(r'uniprotkb:(.*)\(gene name\)',
                                              proteinBLong).group(1).lower()

                    if self.__check_add_collection(collect, proteinAShort,
                                                   proteinBShort):
                        new_entries = new_entries + 1
                except Exception:
                    self.logger.warning(f'could not parse row: {row}')
        self.logger.info(f'PP interaction network {new_entries} new links')
        return collect

    def ppi_tf(self, base_path, data_file) -> dict:
        collect = {}
        new_entries = 0
        pina_filename = os.path.join(base_path, data_file)
        with open(pina_filename) as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                for index, gene_name in enumerate(row):
                    if index == 0:
                        continue
                    if len(gene_name) == 0:
                        continue

                    if self.__check_add_collection(collect,
                                                   row[0].lower(),
                                                   gene_name.lower()):
                        new_entries = new_entries + 1
        self.logger.info(f'PP TF network {new_entries} new links')
        return collect

    def drug_gene_interaction_db(self, base_path, data_file):
        collect = {}
        DRUG_NAME_IDX = 7
        GENE_NAME_IDX = 0
        DRUG_CLAIM_NAME_IDX = 6
        filename = os.path.join(base_path, data_file)
        with open(filename) as tsvfile:
            tsvreader = csv.reader(tsvfile, delimiter="\t")
            for idx, line in enumerate(tsvreader):
                if idx == 0:
                    continue
                drug = line[DRUG_NAME_IDX].lower()
                gene = line[GENE_NAME_IDX].lower()
                if drug.isspace() or drug == '':
                    drug = line[DRUG_CLAIM_NAME_IDX].lower()
                if drug not in collect:
                    collect[drug] = []
                collect[drug].append(gene)
        return collect

    def merge_all(self, all_tf_dicts):
        collect = {}
        for source, source_data in all_tf_dicts.items():
            for tf_name, genes in source_data.items():
                if tf_name not in collect:
                    collect[tf_name] = genes
                else:
                    collect[tf_name] = collect[tf_name].union(genes)
        return collect
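
Driving the importer end to end, again assuming a config.json that contains the "ValidationDataImporter" section read in __init__ (the filename is an assumption):

    importer = ValidationDataImporter("config.json")
    importer.importValidationData()
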