Exemple #1
0
def filter_min_callers_in_any_sample(MATRIX, num_callers):
    """Keep the rows supported by enough callers in at least one sample.

    Every header of MATRIX that starts with "Num Callers" is treated as
    a column of caller counts.  Blank cells are skipped.  A row is kept
    as soon as one of its counts is >= num_callers.

    MATRIX must provide .headers, .num_annots() and header indexing
    (MATRIX[header] -> column of strings).

    Returns the result of AnnotationMatrix.rowslice(MATRIX, kept_rows).

    Raises AssertionError if num_callers is not an int, or if MATRIX has
    no "Num Callers" column.
    """
    assert type(num_callers) is type(0)

    callers_h = [h for h in MATRIX.headers if h.startswith("Num Callers")]
    # This message used to name the "Gene Expression" columns, which
    # sent people looking for the wrong missing annotation.
    assert callers_h, 'Missing: "Num Callers" columns'

    # Imported only after the arguments are validated, so that bad
    # input is reported before anything else can go wrong.
    from genomicode import AnnotationMatrix

    # Look each column up once instead of once per cell.
    columns = [MATRIX[h] for h in callers_h]

    I_keep = []
    for i in range(MATRIX.num_annots()):
        for column in columns:
            # Blank means there is no call for this sample.
            if column[i] and int(column[i]) >= num_callers:
                I_keep.append(i)
                break

    return AnnotationMatrix.rowslice(MATRIX, I_keep)
Exemple #2
0
def filter_min_coverage_in_every_sample(MATRIX, coverage):
    """Keep the rows whose read coverage is high enough in every sample.

    Every header of MATRIX that starts with "Coverage" is treated as a
    per-sample column whose cells look like "Ref/Alt/VAF".  The coverage
    of a cell is Ref + Alt.  A row is dropped if any of its cells is
    blank or has coverage below the requested minimum.

    Returns the result of AnnotationMatrix.rowslice(MATRIX, kept_rows).

    Raises AssertionError if coverage is not an int, if MATRIX has no
    "Coverage" column, or if a cell does not have exactly three fields.
    """
    from genomicode import AnnotationMatrix

    assert type(coverage) is type(0)

    coverage_headers = [
        name for name in MATRIX.headers if name.startswith("Coverage")
    ]
    assert coverage_headers, 'Missing: "Coverage" columns'

    def covered_everywhere(row):
        # True if every sample has at least `coverage` reads on this row.
        for name in coverage_headers:
            cell = MATRIX[name][row]
            if not cell:
                return False
            # Ref/Alt/VAF
            fields = cell.split("/")
            assert len(fields) == 3
            if int(fields[0]) + int(fields[1]) < coverage:
                return False
        return True

    kept_rows = [
        row for row in range(MATRIX.num_annots()) if covered_everywhere(row)
    ]
    return AnnotationMatrix.rowslice(MATRIX, kept_rows)
Exemple #3
0
def filter_min_gene_expression_in_every_sample(MATRIX, gxp):
    """Keep the rows expressed at or above gxp in every sample.

    Every header of MATRIX that starts with "Gene Expression" is treated
    as a per-sample column.  A cell holds either one number ("5.3") or
    several comma-separated numbers ("0,0.379"); in the latter case the
    largest one is used.  A row is dropped if any of its cells is blank
    or has an expression below gxp.

    Returns the result of AnnotationMatrix.rowslice(MATRIX, kept_rows).

    Raises AssertionError if gxp is not a float, or if MATRIX has no
    "Gene Expression" column.
    """
    from genomicode import AnnotationMatrix

    assert type(gxp) is type(0.0)

    expression_headers = [
        name for name in MATRIX.headers if name.startswith("Gene Expression")
    ]
    assert expression_headers, 'Missing: "Gene Expression" columns'

    kept_rows = []
    for row in range(MATRIX.num_annots()):
        for name in expression_headers:
            cell = MATRIX[name][row]
            if not cell:
                break
            # 5.3
            # 0,0.379
            values = [float(value) for value in cell.split(",")]
            if max(values) < gxp:
                break
        else:
            # No sample failed the threshold.
            kept_rows.append(row)

    return AnnotationMatrix.rowslice(MATRIX, kept_rows)
Exemple #4
0
def sort_vcf_file(filename):
    """Sort the VCF file at filename by chromosome and position, in place.

    The file is left untouched if it already looks sorted.  Note that
    this check only compares neighboring rows that are on the same
    chromosome; the order of the chromosomes themselves is not checked.

    Otherwise the rows are reordered by the natural sort order of
    "<CHROM>:<POS>" and the file is written back to the same path.

    Returns None.
    """
    from genomicode import vcflib
    from genomicode import jmath
    from genomicode import AnnotationMatrix

    vcf = vcflib.read(filename)
    chroms = vcf.matrix["#CHROM"]
    positions = [int(pos) for pos in vcf.matrix["POS"]]

    # Nothing to do unless a position goes backwards within a chromosome.
    out_of_order = any(
        chroms[k] == chroms[k + 1] and positions[k + 1] < positions[k]
        for k in range(len(chroms) - 1)
    )
    if not out_of_order:
        return

    # Sort by CHROM and POS.
    sort_keys = [
        "%s:%d" % (chroms[k], positions[k]) for k in range(len(chroms))
    ]
    row_order = jmath.order_list(sort_keys, natural=True)
    vcf.matrix = AnnotationMatrix.rowslice(vcf.matrix, row_order)
    vcflib.write(filename, vcf)
Exemple #5
0
def filter_linked_perc(MATRIX, args):
    """Keep the rows whose "Perc Linked" value does not exceed a maximum.

    args is the maximum percentage (anything float() accepts), or None
    to disable the filter, in which case MATRIX is returned unchanged.
    Rows with a blank "Linkage______Perc Linked" cell are always kept.

    Returns the result of AnnotationMatrix.rowslice(MATRIX, kept_rows).

    Raises AssertionError if the maximum is outside of 0-100.
    """
    if args is None:
        return MATRIX
    from genomicode import AnnotationMatrix

    max_perc = float(args)
    assert 0 <= max_perc <= 100

    linked_column = MATRIX["Linkage______Perc Linked"]
    kept_rows = [
        row for row, value in enumerate(linked_column)
        if value == "" or float(value) <= max_perc
    ]
    return AnnotationMatrix.rowslice(MATRIX, kept_rows)
Exemple #6
0
def filter_sift_polyphen_damaging(MATRIX):
    """Keep the rows flagged by SIFT and by both Polyphen2 predictions.

    A row is kept when its SIFT_pred value is "D" and both its
    Polyphen2_HDIV_pred and Polyphen2_HVAR_pred values are "D" or "P".
    The three columns are found by header suffix.

    Returns the result of AnnotationMatrix.rowslice(MATRIX, kept_rows).

    Raises AssertionError unless exactly one header matches each suffix.
    """
    from genomicode import AnnotationMatrix

    def column_ending_with(suffix):
        # The one column whose header ends with suffix.
        matches = [name for name in MATRIX.headers if name.endswith(suffix)]
        assert len(matches) == 1
        return MATRIX[matches[0]]

    sift_calls = column_ending_with("SIFT_pred")
    hdiv_calls = column_ending_with("Polyphen2_HDIV_pred")
    hvar_calls = column_ending_with("Polyphen2_HVAR_pred")

    polyphen_hits = ("D", "P")
    kept_rows = []
    for row, calls in enumerate(zip(sift_calls, hdiv_calls, hvar_calls)):
        sift, hdiv, hvar = calls
        if sift != "D":
            continue
        if hdiv in polyphen_hits and hvar in polyphen_hits:
            kept_rows.append(row)
    return AnnotationMatrix.rowslice(MATRIX, kept_rows)
Exemple #7
0
def filter_min_callers(MATRIX, args, germline):
    """Keep the rows called by enough callers in a non-germline sample.

    args is the minimum number of callers, or None to disable the
    filter, in which case MATRIX is returned unchanged.  germline is an
    iterable of strings; a "Num Callers" column whose normalized header
    ends with one of them is ignored.  Blank cells are skipped.  A row
    is kept as soon as one remaining column has a count >= args.

    Returns the result of AnnotationMatrix.rowslice(MATRIX, kept_rows).

    Raises AssertionError if args is outside of 1-19.
    """
    if args is None:
        return MATRIX
    from genomicode import AnnotationMatrix

    num_callers = args
    assert 1 <= num_callers < 20

    # Normalized headers of the "Num Callers" columns to look at.
    sample_headers = []
    for k, name in enumerate(MATRIX.headers):
        if not name.startswith("Num Callers"):
            continue
        name_h = MATRIX.headers_h[k]
        if any(name_h.endswith(g) for g in germline):
            continue
        if name_h:
            sample_headers.append(name_h)

    kept_rows = []
    for row in range(MATRIX.num_annots()):
        for name_h in sample_headers:
            cell = MATRIX.header2annots[name_h][row]
            if cell.strip() and int(cell) >= num_callers:
                kept_rows.append(row)
                break

    return AnnotationMatrix.rowslice(MATRIX, kept_rows)
Exemple #8
0
def exonic_only(MATRIX, args):
    """Keep the rows that Annovar annotates as exonic.

    If args is false, the filter is disabled and MATRIX is returned
    unchanged.  Otherwise a row is kept when "exonic" is one of the
    ";"-separated values of its "Annovar______Func.refGene" cell:

        exonic                  kept
        exonic;splicing         kept
        ncRNA_exonic;splicing   dropped

    Returns the result of AnnotationMatrix.rowslice(MATRIX, kept_rows).

    Raises AssertionError if the Annovar column is missing.
    """
    if not args:
        return MATRIX
    from genomicode import AnnotationMatrix

    header = "Annovar______Func.refGene"
    assert header in MATRIX.headers_h

    kept_rows = [
        row for row, func in enumerate(MATRIX.header2annots[header])
        if "exonic" in func.split(";")
    ]
    return AnnotationMatrix.rowslice(MATRIX, kept_rows)
Exemple #9
0
def filter_nonsynonymous(MATRIX):
    """Filter out the synonymous variants.

    Reads the "Annovar______ExonicFunc.refGene" column and keeps the
    rows whose value is "nonsynonymous SNV", "stopgain", "stoploss" or
    "frameshift substitution".

    Returns the result of AnnotationMatrix.rowslice(MATRIX, kept_rows).

    Raises AssertionError if the Annovar column is missing, or if a
    cell holds a value this function does not know about.
    """
    from genomicode import AnnotationMatrix

    # Make sure annotated with Annovar.
    HEADER = "Annovar______ExonicFunc.refGene"
    assert HEADER in MATRIX.headers, "Missing: ExonicFunc.refGene"

    known_values = (
        "", "nonsynonymous SNV", "synonymous SNV",
        "stopgain", "stoploss",
        "frameshift substitution", "nonframeshift substitution",
        "unknown",
    )
    kept_values = (
        "nonsynonymous SNV", "stopgain", "stoploss",
        "frameshift substitution",
    )

    kept_rows = []
    for row, efunc in enumerate(MATRIX[HEADER]):
        assert efunc in known_values, "Unknown exonic_func: %s" % efunc
        if efunc in kept_values:
            kept_rows.append(row)
    return AnnotationMatrix.rowslice(MATRIX, kept_rows)
Exemple #10
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Filter the calls of a simple variant matrix by read support.

        Reads the matrix from in_data.identifier.  Each non-blank
        "<sample>___<caller>___Ref/Alt/VAF" cell is one call.  A call is
        blanked out if it fails any of the enabled thresholds:

          filter_by_min_alt_reads     enabled when > 0
          filter_by_min_total_reads   enabled when > 0
          filter_by_min_vaf           enabled when >= 1E-6

        A call whose value is missing (None) fails an enabled threshold.
        Rows left without any call are removed from the output and
        written, with their original calls, to "discarded.txt" (relative
        path).  The remaining rows are written to out_filename.

        network, out_attributes and num_cores are not used here.

        Returns an empty metadata dict.

        Raises AssertionError if an option is out of range, if no filter
        is enabled, or if a sample/caller header cannot be normalized.
        """
        import itertools
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        summary_file = in_data.identifier
        metadata = {}

        min_alt_reads = mlib.get_user_option(
            user_options, "filter_by_min_alt_reads", not_empty=True, type=int)
        assert min_alt_reads >= 0 and min_alt_reads < 10000

        min_total_reads = mlib.get_user_option(
            user_options, "filter_by_min_total_reads", not_empty=True,
            type=int)
        assert min_total_reads >= 0 and min_total_reads < 10000

        min_vaf = mlib.get_user_option(
            user_options, "filter_by_min_vaf", not_empty=True, type=float)
        assert min_vaf >= 0.0 and min_vaf < 1.0

        # A VAF threshold counts as a filter too.  The cutoff for
        # "enabled" has to be the same one used when filtering below.
        filter_on_vaf = min_vaf >= 1E-6
        assert min_total_reads or min_alt_reads or filter_on_vaf, "No filter"

        matrix = SimpleVariantMatrix.read_as_am(summary_file)

        # copy.deepcopy is very slow.  Try to avoid it.
        # Strategy:
        # 1.  Make a list of the changes to be made.
        # 2.  Save the filtered rows.
        # 3.  Make the changes.
        # 4.  Save the non-filtered rows.
        I_remove = []  # rows with no calls left, in increasing order
        call_remove = {}  # i -> list of (sample, caller)

        # Optimization: normalize the headers for the samples and
        # callers, and look up each column, only once.
        sample_callers = list(
            itertools.product(matrix.samples, matrix.callers))
        sc2header = {}  # (sample, caller) -> header_h
        for sc in sample_callers:
            sample, caller = sc
            header = "%s___%s___Ref/Alt/VAF" % (sample, caller)
            header_h = matrix.normalize_header(header)
            assert header_h
            sc2header[sc] = header_h
        sc_columns = [
            (sc, matrix.header2annots[sc2header[sc]])
            for sc in sample_callers
        ]

        for i in range(matrix.num_annots()):
            has_calls = False  # whether this row has any calls.
            for sc, column in sc_columns:
                call_str = column[i]
                if not call_str:
                    continue
                call = SimpleVariantMatrix._parse_call(call_str)

                filt = False
                # filter_by_min_alt_reads
                if min_alt_reads > 0 and \
                   (call.num_alt is None or call.num_alt < min_alt_reads):
                    filt = True
                # filter_by_min_total_reads
                if min_total_reads > 0 and \
                   (call.total is None or call.total < min_total_reads):
                    filt = True
                # filter_by_min_vaf
                if filter_on_vaf and \
                   (call.vaf is None or call.vaf < min_vaf):
                    filt = True

                if filt:
                    call_remove.setdefault(i, []).append(sc)
                else:
                    has_calls = True

            # If this coordinate has no more calls, then remove the
            # whole row.
            if not has_calls:
                I_remove.append(i)

        # Write out a matrix of the discarded rows.  This has to happen
        # before the calls are blanked out, so the file shows them.
        filtered_matrix = AnnotationMatrix.rowslice(matrix, I_remove)
        SimpleVariantMatrix.write_from_am("discarded.txt", filtered_matrix)

        # Remove the calls.
        for i in call_remove:
            for sc in call_remove[i]:
                column = matrix.header2annots[sc2header[sc]]
                assert column[i]
                column[i] = ""

        # Which rows to keep.
        removed = set(I_remove)
        I_keep = [i for i in range(matrix.num_annots()) if i not in removed]
        filtered_matrix = AnnotationMatrix.rowslice(matrix, I_keep)
        SimpleVariantMatrix.write_from_am(out_filename, filtered_matrix)

        return metadata