Exemple #1
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Filter a simple variant matrix by the number of supporting callers.

        Reads the SimpleVariantMatrix file at in_data.identifier and
        writes a tab-delimited table to out_filename.  The table has
        the annotation columns followed by one column per sample.  A
        sample cell holds the number of callers that made a call for
        that variant, or is blank if that number is below the
        "num_callers" user option.  Variants for which no sample
        reaches the threshold are left out.

        network, out_attributes, and num_cores are not used here.

        Returns the metadata dictionary, which is always empty.
        """
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        simple_file = in_data.identifier
        metadata = {}

        num_callers = mlib.get_user_option(
            user_options, "num_callers", not_empty=True, type=int)
        assert num_callers >= 0 and num_callers < 100

        var_matrix = SimpleVariantMatrix.read(simple_file)
        annot_matrix = var_matrix.annot_matrix
        call_matrix = var_matrix.call_matrix
        samples = var_matrix.samples

        # For each coord and sample, count the number of callers.
        # (chrom, pos, ref, alt) -> sample -> num callers
        coord2sample2nc = {}
        coord2samplecaller2call = call_matrix.coord2samplecaller2call
        for coord, samplecaller2call in coord2samplecaller2call.iteritems():
            sample2nc = coord2sample2nc.setdefault(coord, {})
            for (sample, caller), call in samplecaller2call.iteritems():
                # Only count entries that carry some data.  An entry
                # whose num_ref, num_alt, total and vaf are all falsy
                # is not treated as a call.
                if not (call.num_ref or call.num_alt or call.total
                        or call.vaf):
                    continue
                sample2nc[sample] = sample2nc.get(sample, 0) + 1

        # Keep the (coord, sample) pairs that have enough callers.
        # coord -> sample -> num callers.  Coords with no passing
        # sample get no entry at all.
        calls = {}
        for coord, sample2nc in coord2sample2nc.iteritems():
            passing = dict(
                (sample, nc) for (sample, nc) in sample2nc.iteritems()
                if nc >= num_callers)
            if passing:
                calls[coord] = passing

        # Gather everything needed for the output before opening the
        # file, so that a failure here does not leave a partial file.
        header = annot_matrix.headers + samples
        annot_columns = [
            annot_matrix.header2annots[h] for h in annot_matrix.headers_h]
        chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
        ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
        pos = [int(x) for x in pos]

        # Print out the matrix.  The with block guarantees the file is
        # flushed and closed, even if a row fails to format.
        with open(out_filename, 'w') as handle:
            print >> handle, "\t".join(header)
            for i, coord in enumerate(zip(chrom, pos, ref, alt)):
                passing = calls.get(coord)
                if passing is None:
                    continue
                row = [column[i] for column in annot_columns]
                row.extend(passing.get(sample, "") for sample in samples)
                assert len(row) == len(header)
                print >> handle, "\t".join(map(str, row))

        return metadata
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_filename):
        """Add gene expression values to a simple variant matrix.

        antecedents is a pair of nodes: the simple variant matrix file
        and the gene expression (signal) file.  For every variant row,
        the genes listed in its "Gene.refGene" annotation are looked up
        in the expression data, and a named matrix "Gene Expression"
        (one column per expression sample) is appended to the variant
        matrix, which is then written to out_filename.

        As a side effect, the expression rows for the annotated genes
        are written to "expression.txt" for debugging.
        """
        import arrayio
        from genomicode import filelib
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix

        variant_node, expression_node = antecedents
        filelib.assert_exists_nz(variant_node.identifier)
        filelib.assert_exists_nz(expression_node.identifier)

        svm = SimpleVariantMatrix.read(variant_node.identifier)
        expr_data = arrayio.read(expression_node.identifier)

        # The two files need not have identical samples (e.g. there
        # may be no expression profile for a germline sample), but at
        # least one sample must be shared.
        expr_samples = expr_data.col_names(arrayio.COL_ID)
        num_missing = len([s for s in svm.samples if s not in expr_samples])
        assert num_missing < len(svm.samples), (
            "SimpleVariantMatrix and gene expression file have "
            "no common samples.")

        # Use the first named matrix that has the gene annotation.
        gene_header = "Gene.refGene"
        gene_matrix = None
        for _, candidate in svm.named_matrices:
            if gene_header in candidate.headers:
                gene_matrix = candidate
                break
        assert gene_matrix, "Missing annotation: %s" % gene_header
        gene_column = gene_matrix[gene_header]

        # An annotation is either one gene ("PFN1P2") or a
        # comma-separated list ("PMS2P2,PMS2P7").  Collect each
        # distinct gene once, in sorted order.
        unique_genes = sorted(set(
            g for gene_str in gene_column for g in gene_str.split(",")))

        # Pull the rows for those genes, keeping every expression
        # sample, and save them for debugging purposes.
        aligned = expr_data.matrix(unique_genes, None)
        arrayio.write(aligned, "expression.txt")

        # gene -> list of row indexes in the aligned matrix.  Genes
        # with no matching rows are left out.
        gene2rows = {}
        for gene in unique_genes:
            row_indexes, _col_indexes = aligned._index(row=gene)
            if row_indexes:
                gene2rows[gene] = row_indexes

        # Build one output row per variant.
        #   PFN1P2           ->  5.2
        #   PMS2P2,PMS2P7    ->  2.2,8.6
        # Genes that are missing from the expression data are skipped.
        num_samples = len(expr_samples)
        table = []
        for gene_str in gene_column:
            known = [g for g in gene_str.split(",") if g in gene2rows]
            row = []
            for j in range(num_samples):
                # When a gene has several rows, report the maximum
                # expression.
                best = [
                    max([aligned._X[r][j] for r in gene2rows[g]])
                    for g in known]
                row.append(",".join([_pretty_gxp(v) for v in best]))
            table.append(row)

        # The annotation matrix wants one list of values per sample.
        columns = [[row[j] for row in table] for j in range(num_samples)]
        expr_annots = AnnotationMatrix.create_from_annotations(
            expr_samples, columns)
        svm.named_matrices.append(("Gene Expression", expr_annots))

        SimpleVariantMatrix.write(out_filename, svm)
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Annotate the variants of a simple variant matrix with Annovar.

        Writes the variant coordinates from in_data.identifier to
        "pos.txt", runs the command built by
        alignlib.make_annovar_command on it, and reads the resulting
        "annotations.hg19_multianno.txt".  The Annovar columns (all
        but the leading Chr/Start/End/Ref/Alt) are inserted as the
        first named matrix, "Annovar", and the matrix is written to
        out_filename.  Variants that are not in the Annovar output get
        empty annotations.

        The intermediate files and "annovar.log" use relative paths,
        so they are created in the current working directory.

        network, out_attributes, and num_cores are not used here.

        Returns a metadata dictionary whose "commands" entry holds the
        Annovar command that was run.
        """
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        summary_filename = in_data.identifier
        metadata = {}

        buildver = mlib.get_user_option(
            user_options, "annovar_buildver", allowed_values=["hg19"],
            not_empty=True)

        # Name files.
        annovar_infile = "pos.txt"
        log_filename = "annovar.log"
        # Annovar takes a filestem, without the ".vcf".
        annovar_outstem = "annotations"
        # Produces file:
        # <annovar_outstem>.hg19_multianno.txt
        multianno_file = "%s.hg19_multianno.txt" % annovar_outstem

        # Make the infile for Annovar.
        # <chrom> <start> <end> <ref> <alt>
        # The position is used for both start and end.  skip=2
        # presumably steps over the two header lines that precede the
        # column names in the summary file -- TODO confirm.
        with open(annovar_infile, 'w') as handle:
            for d in filelib.read_row(summary_filename, skip=2, header=1):
                x = d.Chrom, d.Pos, d.Pos, d.Ref, d.Alt
                print >> handle, "\t".join(x)

        cmd = alignlib.make_annovar_command(
            annovar_infile, log_filename, annovar_outstem, buildver,
            vcf_input=False)
        parallel.sshell(cmd)
        metadata["commands"] = [cmd]

        filelib.assert_exists_nz(log_filename)
        filelib.assert_exists_nz(multianno_file)

        matrix = SimpleVariantMatrix.read(summary_filename)
        annot_matrix = matrix.annot_matrix
        chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
        ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
        pos = [int(x) for x in pos]

        # Read in the multianno output file.
        pos2d = {}  # (chrom, start, ref, alt) -> d
        anno_header = None
        for d in filelib.read_row(multianno_file, header=1):
            key = d.Chr, int(d.Start), d.Ref, d.Alt
            assert key not in pos2d, "Duplicate pos: %s" % str(key)
            pos2d[key] = d
            if not anno_header:
                anno_header = d._header
        assert anno_header

        # Multianno starts with:
        # Chr Start End Ref Alt
        # Ignore these.
        assert anno_header[:5] == ["Chr", "Start", "End", "Ref", "Alt"]
        headers = anno_header[5:]

        # Find the Annovar row for each variant once, rather than once
        # per annotation column.  None means Annovar did not report
        # this variant.
        matched = [pos2d.get(coord) for coord in zip(chrom, pos, ref, alt)]

        # One list of annotations per Annovar column, aligned to the
        # rows of the variant matrix.
        all_annots = []
        for i in range(5, len(anno_header)):
            annots = []
            for d in matched:
                x = ""
                if d:
                    x = d._cols[i]
                annots.append(x)
            all_annots.append(annots)
        x = AnnotationMatrix.create_from_annotations(headers, all_annots)
        matrix.named_matrices.insert(0, ("Annovar", x))

        SimpleVariantMatrix.write(out_filename, matrix)

        return metadata
def add_coverage_to_svm(svm_file, coverage_file, outfile, is_rna_cov):
    """Append per-sample coverage to a simple variant matrix file.

    svm_file is read as a SimpleVariantMatrix.  coverage_file is a
    table with the columns

        Chrom  Pos  <Sample>  [<Sample> ...]

    For every variant, the coverage at its (Chrom, Pos) is looked up
    for each sample that appears in both files, and the values are
    appended as a new named matrix.  The matrix is called
    "RNA Coverage" if is_rna_cov is true, and "Coverage" otherwise.
    Coverage values are copied as read from the file; a missing value
    is written as an empty string.  The result is saved to outfile.

    Fails with an AssertionError if the variant matrix has no "Chrom"
    or "Pos" annotation, or if the two files share no samples.
    """
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    # Load the variants and their coordinates.
    svm = SimpleVariantMatrix.read(svm_file)
    annots = svm.annot_matrix
    assert "Chrom" in annots.headers
    assert "Pos" in annots.headers
    chroms = annots["Chrom"]
    positions = [int(p) for p in annots["Pos"]]

    # Load the coverage table.  Columns 0 and 1 are Chrom and Pos;
    # every later column belongs to a sample.
    coverage = {}          # (chrom, pos) -> sample -> coverage value
    seen_samples = set()   # samples with at least one coverage value
    for row in filelib.read_row(coverage_file, header=1):
        sample2cov = coverage.setdefault((row.Chrom, int(row.Pos)), {})
        for col in range(2, len(row._header)):
            value = row._cols[col]
            if value:
                sample = row._header[col]
                sample2cov[sample] = value
                seen_samples.add(sample)

    # Samples sequenced at low coverage may simply have no reads at
    # these positions, so don't insist that every sample is present.
    # Accept the file as long as some samples overlap, and report
    # only those.
    shared = [s for s in svm.samples if s in seen_samples]
    assert shared, (
        "SimpleVariantMatrix and coverage file have "
        "no common samples.")

    # One row of coverage values per variant, in the order of the
    # variant matrix.
    rows = []
    for i in range(annots.num_annots()):
        sample2cov = coverage.get((chroms[i], positions[i]), {})
        rows.append([sample2cov.get(s, "") for s in shared])

    # Add the values back to the variant matrix, one list per sample.
    label = "RNA Coverage" if is_rna_cov else "Coverage"
    cov_annots = AnnotationMatrix.create_from_annotations(
        shared, jmath.transpose(rows))
    svm.named_matrices.append((label, cov_annots))

    SimpleVariantMatrix.write(outfile, svm)