Esempio n. 1
0
def oncodriveclust(project):
    """Compute and save the quality-control summary for OncodriveCLUST.

    Reads the consequences stored in the project database, counts genes and
    samples at each stage (source, synonymous, selected, filter, threshold),
    counts the threshold genes at or below a list of q-value cutoffs, and
    saves everything with ``ProjectResults.save_quality_control`` under the
    name ``"oncodriveclust"``.

    Genes and samples are reported as integer indices assigned in order of
    first appearance; ``source.genes`` / ``source.samples`` in the saved data
    map those indices back to identifiers.

    Args:
        project: Mapping with at least the ``"id"`` and ``"db"`` keys; it is
            also passed as-is to ``get_oncodriveclust_configuration`` and
            ``ProjectResults``.

    Returns:
        None. The result is persisted through ``ProjectResults``.
    """
    log = task.logger
    conf = task.conf

    log.info("--- [{0}] --------------------------------------------".format(project["id"]))

    source_genes = {}  # gene id -> gene index (order of first appearance)
    syn_genes = set()
    selected_genes = set()
    filter_genes = set()
    threshold_genes = set()

    source_samples = {}  # sample name -> sample index (order of first appearance)
    selected_samples = set()
    filter_samples = set()
    threshold_samples = set()

    selected_gene_sample_count = {}  # number of samples for each selected gene
    filter_gene_sample_count = {}  # number of samples per each gene passing the filter

    # get configuration (the filter location itself is not needed here)

    samples_threshold, genes_filter_enabled, _genes_filter, filt = get_oncodriveclust_configuration(
        log, conf, project
    )

    log.info("Retrieving gene alterations ...")

    data = set()  # (gene id, sample index) pairs having a protein affecting consequence

    projdb = ProjectDb(project["db"])
    try:
        # No consequence-type filter is passed, so every consequence returned
        # by the database contributes to the source genes and samples.
        for csq in projdb.consequences(join_samples=True):
            is_selected = so.match(csq.ctypes, so.PROTEIN_AFFECTING)
            is_synonymous = so.match(csq.ctypes, so.SYNONYMOUS)

            # Always resolve the index of the *current* gene. Assigning it only
            # when the gene is first seen would reuse the index of the last
            # newly added gene for every repeated gene.
            gene_index = source_genes.setdefault(csq.gene, len(source_genes))

            if is_selected:
                selected_genes.add(gene_index)

            if is_synonymous:
                syn_genes.add(gene_index)

            for sample in csq.var.samples:
                # Same reasoning as for gene_index above.
                sample_index = source_samples.setdefault(sample.name, len(source_samples))

                if is_selected:
                    selected_samples.add(sample_index)
                    data.add((csq.gene, sample_index))
    finally:
        # Release the database even if reading the consequences fails.
        projdb.close()

    log.info("Counting selected, filtered and threshold ...")

    # calculate selected and filter counts

    data2 = set()  # (gene index, sample index) pairs whose gene passes the filter

    for gene, sample_index in data:
        gene_index = source_genes[gene]
        selected_gene_sample_count[gene_index] = selected_gene_sample_count.get(gene_index, 0) + 1

        # A disabled genes filter must not drop any gene.
        if not genes_filter_enabled or filt.valid(gene):
            data2.add((gene_index, sample_index))
            filter_genes.add(gene_index)
            filter_samples.add(sample_index)
            filter_gene_sample_count[gene_index] = filter_gene_sample_count.get(gene_index, 0) + 1

    # calculate threshold counts

    for gene_index, sample_index in data2:
        if selected_gene_sample_count[gene_index] >= samples_threshold:
            threshold_genes.add(gene_index)
            threshold_samples.add(sample_index)

    log.info("Counting significant genes ...")

    # significance of q-values

    sig_thresholds = [0.0, 0.001, 0.005] + [i / 100.0 for i in range(1, 11)] + [1.0]
    sig_count = [0] * len(sig_thresholds)

    projdb = ProjectDb(project["db"])
    try:
        for gene in projdb.genes():
            if gene.id in source_genes and source_genes[gene.id] in threshold_genes:
                # NOTE(review): this reads fm_qvalue although the report is for
                # OncodriveCLUST -- confirm this is the intended q-value field.
                # Find the first cutoff that is >= the gene q-value ...
                i = 0
                while i < len(sig_thresholds) and gene.fm_qvalue > sig_thresholds[i]:
                    i += 1

                # ... the gene is significant at that cutoff and all looser ones.
                for j in range(i, len(sig_count)):
                    sig_count[j] += 1
    finally:
        projdb.close()

    source_genes_count = len(source_genes)
    syn_genes_count = len(syn_genes)
    selected_genes_count = len(selected_genes)
    filter_genes_count = len(filter_genes)
    threshold_genes_count = len(threshold_genes)

    source_samples_count = len(source_samples)
    selected_samples_count = len(selected_samples)
    filter_samples_count = len(filter_samples)
    threshold_samples_count = len(threshold_samples)

    # Genes passing the filter, from the most to the least mutated (by samples).
    sorted_filter_genes = sorted(filter_genes, reverse=True, key=lambda gi: filter_gene_sample_count[gi])

    qc_data = dict(
        source=dict(
            genes=sorted(source_genes.keys(), key=lambda k: source_genes[k]),
            genes_count=source_genes_count,
            genes_lost_count=max(0, source_genes_count - syn_genes_count - threshold_genes_count),
            samples=sorted(source_samples.keys(), key=lambda k: source_samples[k]),
            samples_count=source_samples_count,
        ),
        # NOTE(review): this key sits at the top level, not inside "source";
        # kept as-is because consumers of the saved data may rely on it.
        samples_lost_count=max(0, source_samples_count - threshold_samples_count),
        synonymous=dict(
            genes=sorted(syn_genes),
            genes_count=syn_genes_count,
            ratio=(float(syn_genes_count) / selected_genes_count) if selected_genes_count > 0 else 0,
        ),
        selected=dict(
            genes=sorted(selected_genes),
            genes_count=selected_genes_count,
            genes_lost=sorted(set(source_genes.values()) - syn_genes - selected_genes),
            genes_lost_count=max(0, source_genes_count - syn_genes_count - selected_genes_count),
            samples=sorted(selected_samples),
            samples_count=selected_samples_count,
            samples_lost=sorted(set(source_samples.values()) - selected_samples),
            samples_lost_count=max(0, source_samples_count - selected_samples_count),
        ),
        filter=dict(
            genes=sorted_filter_genes,
            genes_count=filter_genes_count,
            genes_lost=sorted(selected_genes - filter_genes),
            genes_lost_count=max(0, selected_genes_count - filter_genes_count),
            genes_sample_count=[filter_gene_sample_count[gene_index] for gene_index in sorted_filter_genes],
            samples=sorted(filter_samples),
            samples_count=filter_samples_count,
            samples_lost=sorted(selected_samples - filter_samples),
            samples_lost_count=max(0, selected_samples_count - filter_samples_count),
        ),
        threshold=dict(
            genes=sorted(threshold_genes),
            genes_count=threshold_genes_count,
            genes_lost=sorted(filter_genes - threshold_genes),
            genes_lost_count=max(0, filter_genes_count - threshold_genes_count),
            samples=sorted(threshold_samples),
            samples_count=threshold_samples_count,
            samples_threshold=samples_threshold,
            samples_lost=sorted(filter_samples - threshold_samples),
            samples_lost_count=max(0, filter_samples_count - threshold_samples_count),
        ),
        # The first cutoff (0.0) is only a sentinel for the search above.
        results=dict(sig_thresholds=sig_thresholds[1:], sig_count=sig_count[1:]),
    )

    project_results = ProjectResults(project)
    project_results.save_quality_control("oncodriveclust", qc_data)
Esempio n. 2
0
def prepare_files(project):
	"""Write the OncodriveCLUST input files for a project and forward it.

	Retrieves the gene alterations from the project database and writes, in
	the project temporary path:

	- ``oncodriveclust-non-syn-data.tsv`` and ``oncodriveclust-syn-data.tsv``
	  with one (gene, sample, protein position) line per retrieved entry;
	  non-synonymous entries of genes rejected by the genes filter are skipped.
	- ``oncodriveclust-excluded-cause.tsv`` listing, for each gene having
	  non-synonymous entries, why it is excluded (filter and/or threshold).

	The project, extended with an ``oncodriveclust`` entry, is finally sent
	through the ``projects_out`` port.

	Args:
		project: Mapping with at least the ``"id"``, ``"db"`` and
			``"temp_path"`` keys.

	Returns:
		None.
	"""
	log = task.logger
	conf = task.conf

	projects_out_port = task.ports("projects_out")

	project_id = project["id"]
	log.info("--- [{0}] --------------------------------------------".format(project_id))

	# Not used below; kept because the constructor's side effects are not
	# visible from here.
	project_results = ProjectResults(project)

	mutations_threshold, genes_filter_enabled, genes_filter, filt = get_oncodriveclust_configuration(log, conf, project)

	log.info("Loading transcripts CDS length ...")

	cds_len = load_cds_len(conf)

	log.info("Retrieving gene alterations ...")

	projdb = ProjectDb(project["db"])
	try:
		data = retrieve_data(projdb, cds_len)
	finally:
		# Release the database even if the retrieval fails.
		projdb.close()

	# Indexed with the NON_SYN / SYN constants.
	data_paths = [
		os.path.join(project["temp_path"], "oncodriveclust-non-syn-data.tsv"),
		os.path.join(project["temp_path"], "oncodriveclust-syn-data.tsv")]

	log.info("Saving data ...")
	log.debug("> {0}".format(data_paths[NON_SYN]))
	log.debug("> {0}".format(data_paths[SYN]))

	gene_sample_count = {}  # gene -> number of non-synonymous entries (counted before filtering)

	df = []
	try:
		# Open inside the try block so that a failure opening the second file
		# still closes the first one.
		for path in data_paths:
			df.append(tsv.open(path, "w"))

		for (findex, gene, sample), (_transcript, _transcript_len, protein_pos) in data.items():
			if findex == NON_SYN:
				gene_sample_count[gene] = gene_sample_count.get(gene, 0) + 1

				# Only non-synonymous entries are subject to the genes filter.
				if genes_filter_enabled and not filt.valid(gene):
					continue

			tsv.write_line(df[findex], gene, sample, protein_pos)
	finally:
		for f in df:
			f.close()

	exc_path = os.path.join(project["temp_path"], "oncodriveclust-excluded-cause.tsv")

	log.info("Saving excluded gene causes ...")
	log.debug("> {0}".format(exc_path))

	with tsv.open(exc_path, "w") as exf:
		tsv.write_line(exf, "GENE", "EXCLUDED_CAUSE")
		for gene, sample_count in gene_sample_count.items():
			causes = []
			if genes_filter_enabled and not filt.valid(gene):
				causes.append(ProjectDb.GENE_EXC_FILTER)
			if sample_count < mutations_threshold:
				causes.append(ProjectDb.GENE_EXC_THRESHOLD)
			if causes:
				# The cause codes are concatenated without a separator.
				tsv.write_line(exf, gene, "".join(causes))

	log.info("Sending project ...")

	projects_out_port.send(dict(project,
								oncodriveclust=dict(
									data_paths=data_paths,
									mutations_threshold=mutations_threshold,
									genes_filter_enabled=genes_filter_enabled, # not used
									genes_filter=genes_filter))) # not used