Esempio n. 1
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Copy a simple variant file, dropping rows that fail the filters.

        The tab-delimited file at in_data.identifier is read with its
        header.  The header and every surviving row are written to a
        temporary file, which is then moved to out_filename.

        User options read:
        remove_samples  Comma-separated sample names.  Rows whose Sample
                        column matches one of them are dropped.
        apply_filter    "yes" or "no".  If "yes", a row is kept only if
                        is_pass() of the row's caller accepts the Filter
                        column.
        wgs_or_wes      "wgs" or "wes".  Passed to is_pass() as a second
                        argument for rows whose Caller is "MuSE".

        Returns the (empty) metadata dictionary.  Raises AssertionError
        if two callers share a name or a row names an unknown caller.
        """
        import shutil
        from genomicode import filelib
        from genomicode import vcflib
        from Betsy import module_utils as mlib

        simple_file = in_data.identifier
        metadata = {}

        x = mlib.get_user_option(user_options, "remove_samples")
        # A set makes the per-row membership test constant time.
        remove_samples = set(name.strip() for name in x.split(","))

        x = mlib.get_user_option(user_options,
                                 "apply_filter",
                                 allowed_values=["no", "yes"])
        apply_filter = (x == "yes")

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])

        name2caller = {}  # name -> Caller object
        for caller_class in vcflib.CALLERS:
            caller = caller_class()
            assert caller.name not in name2caller
            name2caller[caller.name] = caller

        TEMPFILE = "temp.txt"
        # The with block guarantees that the temporary file is flushed
        # and closed even if a row trips one of the assertions below.
        with open(TEMPFILE, 'w') as handle:
            it = filelib.read_row(simple_file, header=1)
            handle.write("\t".join(it._header) + "\n")
            for d in it:
                # Find the caller.
                assert d.Caller in name2caller, \
                    "Unknown caller: %s" % d.Caller
                caller = name2caller[d.Caller]

                # remove_samples
                if d.Sample in remove_samples:
                    continue

                # apply_filter
                if apply_filter:
                    args = d.Filter,
                    if d.Caller == "MuSE":
                        # MuSE's is_pass also needs the sequencing type.
                        args = d.Filter, wgs_or_wes
                    if not caller.is_pass(*args):
                        continue

                handle.write("\t".join(d._cols) + "\n")

        shutil.move(TEMPFILE, out_filename)

        return metadata
Esempio n. 2
0
def subtract_mouse_reads(
    summary_file, in_fastq, out_fastq, sub_fastq, num_mismatches):
    """Split a FASTQ file into mouse-like reads and all other reads.

    summary_file    Tab-delimited file with a header.  The query_name
                    and NM columns are read.
    in_fastq        FASTQ file to split.
    out_fastq       Receives the reads NOT classified as mouse.
    sub_fastq       Receives the reads classified as mouse (the ones
                    subtracted from the input).
    num_mismatches  A read is accepted as mouse if its NM value is less
                    than or equal to this number.

    Rows of summary_file with an empty NM are ignored.
    """
    from genomicode import filelib
    from genomicode import genomelib

    # List the reads that look like mouse.
    mouse_reads = set()
    for d in filelib.read_row(summary_file, header=1):
        if not d.NM:  # ignore missing alignments
            continue
        if int(d.NM) <= num_mismatches:
            mouse_reads.add(d.query_name)

    # Both outputs are closed (and therefore flushed) when the blocks
    # exit, including when reading the input fails part-way.
    with open(out_fastq, 'w') as outhandle:
        with open(sub_fastq, 'w') as subhandle:
            for title, sequence, quality in genomelib.read_fastq(in_fastq):
                read_name = title
                if read_name.startswith("@"):
                    read_name = read_name[1:]
                # The summary only contains the first part of the title.
                read_name = read_name.split()[0]
                if read_name in mouse_reads:
                    genomelib.write_fastq(title, sequence, quality, subhandle)
                else:
                    genomelib.write_fastq(title, sequence, quality, outhandle)
Esempio n. 3
0
def annotate_firehose_methylation(filename, output):
    """Rewrite a Firehose methylation file with gene IDs and one column
    per sample.

    filename  Tab-delimited input with two header lines.  The second
              header line must start with the columns
              'Composite Element REF', 'Beta_value', 'Gene_Symbol',
              'Chromosome', 'Genomic_Coordinate'.
    output    File to create.  Columns are Probe.ID, Gene.ID,
              Gene.Symbol, Chromosome, Genomic.Coordinate, followed by
              every fourth input column starting at column 1 (columns
              1, 5, 9, ...).  Column 1 is Beta_value; presumably the
              others are the Beta_value of the remaining samples.

    Gene symbols (separated by ";") are looked up with
    genefinder.find_many_genes using tax_id 9606.  Symbols without an
    ID get an empty ID.  Raises AssertionError if the second header
    line is not as expected or a row has the wrong number of columns.
    """
    HEADER_PREFIXES = ('Hybridization REF', 'Composite Element REF')

    # Read exactly the first two lines.  (readlines(2) takes a size hint
    # in bytes, not a number of lines.)
    with open(filename, 'r') as f:
        f.readline()
        second_line = f.readline()
    columns = second_line.rstrip("\r\n").split('\t')
    assert columns[:5] == [
        'Composite Element REF', 'Beta_value', 'Gene_Symbol', 'Chromosome',
        'Genomic_Coordinate'
    ]

    # Collect every gene symbol that appears in a data row.
    all_symbols = {}
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith(HEADER_PREFIXES):
                continue
            words = line.rstrip("\r\n").split('\t')
            for x in words[2].split(";"):
                all_symbols[x] = 1
    all_symbols = sorted(all_symbols)

    # Look up all the symbols in the genefinder.
    symbol2id = {}
    genes = genefinder.find_many_genes(all_symbols, tax_id=9606)
    for (symbol, gene) in zip(all_symbols, genes):
        gene_id = gene[1]
        if gene_id is None:
            gene_id = ""
        symbol2id[symbol] = gene_id

    # The sample names come from the first header line.
    handle = filelib.read_row(filename, header=1)
    samples_names = [
        name for (i, name) in enumerate(handle._header) if not (i - 1) % 4
    ]
    header = [
        "Probe.ID", "Gene.ID", "Gene.Symbol", "Chromosome",
        "Genomic.Coordinate"
    ] + samples_names

    with open(output, 'w') as out:
        out.write("\t".join(header) + '\n')
        with open(filename) as infile:
            for line in infile:
                if line.startswith(HEADER_PREFIXES):
                    continue
                # Strip the line ending so that it cannot leak into the
                # last column (which is Genomic_Coordinate when the file
                # holds a single sample).
                items = line.rstrip("\r\n").split('\t')
                probe_id = items[0]
                gene_symbols = items[2]
                chromosome = items[3]
                genomic_coordinate = items[4]
                values = [
                    item for (i, item) in enumerate(items)
                    if not (i - 1) % 4
                ]
                ids = [symbol2id.get(x, "") for x in gene_symbols.split(";")]
                gene_id = ";".join(map(str, ids))
                row = [
                    probe_id, gene_id, gene_symbols, chromosome,
                    genomic_coordinate
                ] + values
                assert len(row) == len(header)
                out.write("\t".join(map(str, row)) + '\n')
Esempio n. 4
0
def format_firehose_rppa(filename, output):
    """Reformat a Firehose RPPA file, adding gene symbol and ID columns.

    filename  Tab-delimited input with a header.  Each row must have a
              Composite_Element_REF value of the form
              "<gene symbols>|<antibody>", where the gene symbols are
              separated by whitespace.
    output    File to create.  Columns are "Gene Symbol", "Gene ID",
              "Antibody", followed by all input columns except the
              first.  Multiple symbols or IDs are joined with ";".

    Gene IDs are looked up with genefinder.find_many_genes; symbols
    without an ID are left out of the ID list.  Raises AssertionError on
    a malformed row.
    """
    COMP_REF_H = "Composite_Element_REF"

    rows = filelib.read_row(filename, header=1)
    header = ["Gene Symbol", "Gene ID", "Antibody"] + rows._header[1:]
    # The with block closes the output even if a row fails an assertion.
    with open(output, 'w') as f:
        f.write("\t".join(header) + '\n')
        for d in rows:
            assert hasattr(d, COMP_REF_H)
            parts = getattr(d, COMP_REF_H).split("|")
            assert len(parts) == 2
            symbol_str, antibody = parts
            gene_symbols = [s.strip() for s in symbol_str.split()]
            found = genefinder.find_many_genes(gene_symbols, tax_id="9606")
            gene_ids = [gene[1] for gene in found if gene[1]]
            # Hard-coded ID for the symbol CDC2.
            if gene_symbols == ["CDC2"]:
                gene_ids = ["983"]

            gene_symbol_str = ";".join(gene_symbols)
            gene_id_str = ";".join(map(str, gene_ids))
            row = [gene_symbol_str, gene_id_str, antibody] + d._cols[1:]
            assert len(row) == len(header)
            f.write("\t".join(map(str, row)) + '\n')
def read_signatures(sigdb_path, desired_normalization, desired_tags):
    """Return the rows of signatures.txt that match the request.

    sigdb_path             Directory that contains signatures.txt and
                           the training files it refers to.
    desired_normalization  Normalizations to accept (case insensitive).
    desired_tags           Tags to accept (case insensitive).  A row is
                           kept if at least one of its tags is listed.

    Rows with a blank Normalization, Genes, Metagenes, Quantile, Train0
    or Train1 are skipped, as are rows whose training files cannot be
    found (with or without a ".gz" suffix).  On the returned rows,
    Train0 and Train1 are resolved paths and xID, Genes and Metagenes
    are integers.
    """
    from genomicode import filelib

    signature_file = os.path.join(sigdb_path, "signatures.txt")
    assert os.path.exists(signature_file), "Missing signatures.txt file."

    wanted_tags = set(tag.upper() for tag in desired_tags)
    wanted_norms = set(norm.upper() for norm in desired_normalization)

    signatures = []
    for d in filelib.read_row(signature_file, header=1):
        # Wrong normalization.
        if d.Normalization.upper() not in wanted_norms:
            continue

        # None of the row's tags was asked for.
        row_tags = [tag.upper() for tag in d.Tags.split()]
        if not any(tag in wanted_tags for tag in row_tags):
            continue

        # Every parameter must be supplied.
        if not (d.Normalization and d.Genes and d.Metagenes and
                d.Quantile and d.Train0 and d.Train1):
            continue

        # Locate the training files, falling back to gzipped copies.
        train_files = []
        for relative in (d.Train0, d.Train1):
            path = os.path.realpath(os.path.join(sigdb_path, relative))
            if not os.path.exists(path):
                path = path + ".gz"
            train_files.append(path)
        if not all(os.path.exists(path) for path in train_files):
            continue
        d.Train0, d.Train1 = train_files

        # xls2txt converts all values to floats.
        d.xID = int(float(d.xID))
        d.Genes = int(float(d.Genes))
        d.Metagenes = int(float(d.Metagenes))
        signatures.append(d)
    return signatures
Esempio n. 6
0
def format_firehose_gistic(filename, output):
    """Reformat a Firehose GISTIC file so the gene ID comes first.

    filename  Tab-delimited input with a header.  The Gene_Symbol and
              Locus_ID columns are read.
    output    File to create.  Columns are "Gene ID" (from Locus_ID),
              "Gene Symbol", followed by all input columns from the
              third one on.

    Raises AssertionError if a row does not match the header length.
    """
    # Read the input first so that a bad input does not leave an empty
    # output file behind.
    rows = filelib.read_row(filename, header=1)
    header = ["Gene ID", "Gene Symbol"] + rows._header[2:]
    with open(output, 'w') as f:
        f.write("\t".join(header) + "\n")
        for d in rows:
            row = [d.Locus_ID, d.Gene_Symbol] + d._cols[2:]
            assert len(row) == len(header)
            f.write("\t".join(map(str, row)) + "\n")
 def run(
     self, network, antecedents, out_attributes, user_options, num_cores,
     outfile):
     """Keep the matrix rows whose probe pair is a "Best for Both" match.

     The mapping file config.HumanHT_12_to_HG_u133_Plus_2 lists pairs of
     Affymetrix and Illumina probe IDs.  A pair is accepted if its
     Distance is at most 1000 and its Match is 'Best for Both'.  Rows of
     the matrix at antecedents.identifier whose (HG_U133_Plus_2 ID,
     HumanHT_12 ID) pair is accepted are written to outfile in
     tab-delimited format.

     Returns None.  Nothing is written if either platform is not found
     among the row annotations or if no row matches.
     """
     from genomicode import filelib
     import os
     import arrayio
     from genomicode import config
     from genomicode import arrayplatformlib
     in_data = antecedents
     mapfile = config.HumanHT_12_to_HG_u133_Plus_2
     assert os.path.exists(mapfile), 'mapping file %s does not exist' % mapfile

     # A set of (Affymetrix ID, Illumina ID) pairs, so that the per-row
     # lookup below does not scan the whole mapping each time.
     best_pairs = set()
     for d in filelib.read_row(mapfile, header=True):
         if int(d.Distance) <= 1000 and d.Match == 'Best for Both':
             best_pairs.add((d.Affymetrix_Probe_Set_ID, d.Illumina_Probe_ID))

     M = arrayio.read(in_data.identifier)
     platform_list = arrayplatformlib.score_all_platforms_of_matrix(M)
     illu_id = None
     probe_id = None
     # NOTE(review): each platform entry is indexed with [0] to get a
     # row-annotation header and searched for the platform name; confirm
     # the exact shape against arrayplatformlib.
     for platform in platform_list:
         if 'HumanHT_12' in platform:
             illu_id = M._row_names[platform[0]]
         if 'HG_U133_Plus_2' in platform:
             probe_id = M._row_names[platform[0]]

     if not illu_id or not probe_id:
         return None

     index = [
         i for i in range(M.nrow())
         if (probe_id[i], illu_id[i]) in best_pairs
     ]
     if not index:
         return None

     M_new = M.matrix(index, None)
     # The with block closes the output even if writing fails.
     with open(outfile, 'w') as f:
         arrayio.tab_delimited_format.write(M_new, f)
     assert filelib.exists_nz(outfile), (
         'the output file %s for best_match_both fails' % outfile
     )
def add_snpeff_to_svm(svm_file, snpeff_file, outfile):
    """Insert SnpEff annotation columns into a simple variant matrix.

    svm_file     Simple variant matrix to annotate.
    snpeff_file  Tab-delimited annotations with a header.  The first
                 four columns are Chrom, Pos, Ref, Alt; the remaining
                 columns are the annotations.
    outfile      File to create.  The annotation columns, renamed to
                 "SnpEff______<name>", are placed after the first four
                 columns of the matrix.  Variants without an annotation
                 get empty strings.

    If filelib.exists_nz(snpeff_file) is false, or the file holds no
    annotation rows, svm_file is copied to outfile unchanged.
    """
    import shutil
    from genomicode import filelib
    from genomicode import SimpleVariantMatrix
    from genomicode import AnnotationMatrix

    if not filelib.exists_nz(snpeff_file):
        shutil.copy2(svm_file, outfile)
        return

    # Read the annotations.
    header = None  # includes Chrom, Pos, Ref, Alt
    coord2d = {}
    for d in filelib.read_row(snpeff_file, header=1):
        if header is None:
            header = d._header
        coord = d.Chrom, d.Pos, d.Ref, d.Alt
        coord2d[coord] = d

    if header is None:
        # The file has no annotation rows, so there is nothing to merge
        # and no header to slice below.
        shutil.copy2(svm_file, outfile)
        return

    svm = SimpleVariantMatrix.read_as_am(svm_file)
    CHROM = svm.header2annots["______Chrom"]
    POS = svm.header2annots["______Pos"]
    REF = svm.header2annots["______Ref"]
    ALT = svm.header2annots["______Alt"]

    snpeff_header = header[4:]
    snpeff_matrix = []  # Row major.
    for i in range(len(CHROM)):
        coord = CHROM[i], POS[i], REF[i], ALT[i]
        row = [""] * len(snpeff_header)
        d = coord2d.get(coord)
        if d:
            row = d._cols[4:]
        assert len(row) == len(snpeff_header)
        snpeff_matrix.append(row)
    assert len(snpeff_matrix) == len(CHROM)

    # AnnotationMatrix is column major.  Build one list per annotation
    # column, even when the matrix has no rows.
    snpeff_annots = [
        [row[j] for row in snpeff_matrix]
        for j in range(len(snpeff_header))
    ]

    # Convert the headers to SVM format.
    snpeff_header = ["SnpEff______%s" % name for name in snpeff_header]

    # Make the new SimpleVariantMatrix.
    headers = svm.headers[:4] + snpeff_header + svm.headers[4:]
    svm_annots = [svm.header2annots[h] for h in svm.headers_h]
    all_annots = svm_annots[:4] + snpeff_annots + svm_annots[4:]
    merged = AnnotationMatrix.create_from_annotations(
        headers, all_annots, headerlines=svm.headerlines)
    SimpleVariantMatrix.write_from_am(outfile, merged)
Esempio n. 9
0
def _read_insert_sizes(filename):
    """Return a dict mapping sample name to rounded mean insert size.

    filename should be a tab-delimited summary with a header and the
    columns Sample and MEAN_INSERT_SIZE (e.g. a summary of Picard's
    insert size metrics).  Raises AssertionError if a column is missing
    or a size is outside the range 1-9999.
    """
    from genomicode import filelib

    sample2size = {}
    for row in filelib.read_row(filename, header=1):
        assert hasattr(row, "Sample"), \
               "Missing in summary: Sample"
        assert hasattr(row, "MEAN_INSERT_SIZE"), \
               "Missing in summary: MEAN_INSERT_SIZE"
        insert_size = int(round(float(row.MEAN_INSERT_SIZE)))
        assert 0 < insert_size < 10000  # sanity check
        sample2size[row.Sample] = insert_size
    return sample2size
Esempio n. 10
0
def _read_fragment_sizes(filename):
    """Return a dict mapping sample name to rounded mean read length.

    filename should be a tab-delimited summary with a header and the
    columns Sample and MEAN_READ_LENGTH (e.g. a summary of Picard's
    alignment metrics).  Raises AssertionError if a column is missing
    or a length is outside the range 1-9999.
    """
    from genomicode import filelib

    sample2readlen = {}
    for row in filelib.read_row(filename, header=1):
        assert hasattr(row, "Sample"), \
               "Missing in summary: Sample"
        assert hasattr(row, "MEAN_READ_LENGTH"), \
               "Missing in summary: MEAN_READ_LENGTH"
        read_length = int(round(float(row.MEAN_READ_LENGTH)))
        assert 0 < read_length < 10000  # sanity check
        sample2readlen[row.Sample] = read_length
    return sample2readlen
def find_significant_gene_sets(gsea_path, name1, name2, fdr_cutoff):
    """Return the significant gene sets from the two GSEA reports.

    gsea_path   Directory that contains exactly two files named
                gsea_report_for_<name>_<some number>.xls, e.g.
                gsea_report_for_NET1KO_IMM_1482443241364.xls
    name1       Class name found in the file name of the first report.
    name2       Class name found in the file name of the second report.
    fdr_cutoff  Gene sets with FDR_q_val below this value are kept.
                Must be in the range (0, 1].

    Returns a tuple of:
    1.  list of gene sets (NAME column) associated with name1
    2.  list of gene sets (NAME column) associated with name2
    """
    import os
    from genomicode import filelib

    assert fdr_cutoff > 0 and fdr_cutoff <= 1.0

    reports = [
        x for x in os.listdir(gsea_path)
        if x.startswith("gsea_report_for_") and x.endswith(".xls")
    ]
    assert reports, "Could not find gsea_report"
    assert len(reports) == 2, "Could not find 2 gsea_reports"
    report1, report2 = reports

    # Swap only if both names fit the opposite report.  This also gives
    # the right assignment when one name is a substring of the other.
    if name2 in report1 and name1 in report2:
        report1, report2 = report2, report1
    assert name1 in report1 and name2 in report2

    gene_sets = []
    for report in (report1, report2):
        filename = os.path.join(gsea_path, report)
        # Pass an open handle rather than the filename.  The original
        # note suggests read_row would otherwise try to read a .xls
        # filename as an Excel file -- TODO confirm.  The with block
        # makes sure the handle is closed again.
        with open(filename) as handle:
            ds = [d for d in filelib.read_row(handle, header=1)]
        gene_sets.append(
            [d.NAME for d in ds if float(d.FDR_q_val) < fdr_cutoff])

    gs1, gs2 = gene_sets
    return gs1, gs2
Esempio n. 12
0
def parse_bedtools_genomecov_results(filename):
    """Parse the output of bedtools genomecov.

    Returns a list of (chromosome or "genome", coverage, bases with
    coverage, size of chromosome, perc bases with coverage).  Example
    input:
    genome  0       18413   4392353 0.00419206
    genome  1       17191   4392353 0.00391385
    genome  2       19904   4392353 0.00453151
    genome  3       27298   4392353 0.00621489
    ...
    """
    import filelib

    row_format = "chrom:s depth:d num_bases:d chr_size:d perc_bases:f"
    return [
        (row.chrom, row.depth, row.num_bases, row.chr_size, row.perc_bases)
        for row in filelib.read_row(filename, row_format)
    ]
Esempio n. 13
0
def _make_scatter(povray, pca_file, pov_file, out_file):
    from genomicode import filelib
    from genomicode import pcalib

    ds = [d for d in filelib.read_row(pca_file, header=1)]
    X = [float(d.PC_0) for d in ds]
    Y = [float(d.PC_1) for d in ds]
    #Z = [float(d.PC_2) for d in ds]
    DATASET = [int(d.Dataset) for d in ds]
    assert min(DATASET) >= 0 and max(DATASET) < 256

    x = pcalib.plot_scatter(X,
                            Y,
                            out_file,
                            group=DATASET,
                            pov_file=pov_file,
                            povray=povray)
    print x
    sys.stdout.flush()
    assert os.path.exists(out_file), "Failed to plot predictions."
Esempio n. 14
0
def read_normal_cancer_file(file_or_handle):
    """Return a list of (normal_sample, tumor_sample) tuples.

    file_or_handle is either the name of an existing file or an open
    handle.  The file must have a header with the columns Normal and
    Cancer.  Sample names are stripped of surrounding whitespace.
    Raises AssertionError if a column is missing or a row has the same
    normal and cancer sample.
    """
    import os
    from genomicode import filelib

    handle = file_or_handle
    if type(file_or_handle) is str:
        assert os.path.exists(file_or_handle)
        handle = filelib.openfh(file_or_handle)

    pairs = []
    for row in filelib.read_row(handle, header=1, pad_cols=""):
        assert hasattr(row, "Normal"), "Missing header: Normal"
        assert hasattr(row, "Cancer"), "Missing header: Cancer"
        normal = row.Normal.strip()
        tumor = row.Cancer.strip()
        assert normal != tumor
        pairs.append((normal, tumor))
    return pairs
Esempio n. 15
0
def summarize_probabilities(signatures, names, paths, file_layout):
    """Collect the test-sample probabilities of every signature into one
    table.

    signatures   Signature objects.  xID and Name are read, and the
                 optional attribute Changed.
    names        Name of each signature, used in error messages.
    paths        Output path of each signature.  Each must contain a
                 probabilities.txt file with a header and the columns
                 Type, Sample and Probability.
    file_layout  Object whose PROBABILITIES_PCL attribute is the file to
                 create.

    The output has the columns SigID, NAME and one column per test
    sample, with one row per signature.  The name of a signature whose
    Changed attribute is true gets a trailing "*".  Raises
    AssertionError if a probability file is missing or the signatures
    do not share the same test samples in the same order.
    """
    from genomicode import filelib
    from genomicode import hashlib

    _hash = hashlib.hash_sampleid

    sample_names = []   # list of sample names
    probabilities = []  # matrix of probabilities, one row per signature
    for i, _sig in enumerate(signatures):
        name, outpath = names[i], paths[i]
        filename = os.path.join(outpath, "probabilities.txt")
        assert os.path.exists(filename), \
               "Could not find probability file for %s." % name
        ds = [d for d in filelib.read_row(filename, header=1)]
        ds = [d for d in ds if d.Type == "test"]

        # Assign and check the sample names.
        if not sample_names:
            sample_names = [d.Sample for d in ds]
        assert len(sample_names) == len(ds)
        for expected, d in zip(sample_names, ds):
            assert _hash(d.Sample) == _hash(expected)

        # NOTE: values such as "nan" are not treated specially here.
        probabilities.append([float(d.Probability) for d in ds])

    # Write out the probability file.  The with block closes the file
    # even if a signature lacks an attribute.
    with open(file_layout.PROBABILITIES_PCL, 'w') as handle:
        handle.write("\t".join(["SigID", "NAME"] + sample_names) + "\n")
        for sig, probs in zip(signatures, probabilities):
            name = sig.Name
            # If the signature was modified, make a notation.
            if getattr(sig, "Changed", False):
                name = "%s*" % name
            row = [sig.xID, name] + probs
            handle.write("\t".join(map(str, row)) + "\n")
Esempio n. 16
0
def annotate_linked_variants(MATRIX, args):
    """Add a "Linkage______Score" column to an annotation matrix.

    MATRIX  Annotation matrix with the headers "______Chrom" and
            "______Pos".
    args    Name of a tab-delimited link file with a header and the
            columns Chrom, Pos and Perc_Linked.  If false (e.g. None or
            ""), MATRIX is returned unchanged.

    Returns a new matrix in which the score column follows the first
    four columns (Chrom, Pos, Ref, Alt).  Variants that are not in the
    link file get an empty string.
    """
    if not args:
        return MATRIX
    from genomicode import filelib
    from genomicode import AnnotationMatrix

    link_file = args
    filelib.assert_exists_nz(link_file)

    # (chrom, pos) -> percent linked
    coord2perc = {}
    for row in filelib.read_row(link_file, header=1):
        link_chrom = row.Chrom
        link_pos = int(row.Pos)
        link_perc = float(row.Perc_Linked)
        coord2perc[(link_chrom, link_pos)] = link_perc

    chroms = MATRIX.header2annots["______Chrom"]
    positions = [int(p) for p in MATRIX.header2annots["______Pos"]]
    link_score = [
        coord2perc.get((chroms[i], positions[i]), "")
        for i in range(len(chroms))
    ]

    # Insert the new column after Chrom, Pos, Ref, Alt.
    score_header = "Linkage______Score"
    assert score_header not in MATRIX.headers
    headers = MATRIX.headers[:4] + [score_header] + MATRIX.headers[4:]
    all_annots = [
        link_score if h == score_header else MATRIX[h] for h in headers
    ]
    return AnnotationMatrix.create_from_annotations(
        headers, all_annots, MATRIX.headerlines)
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Convert a simple variant file into a simple variant matrix.

        Reads every row of the file at in_data.identifier into memory.
        The columns Chrom, Pos, Ref, Alt, Sample, Caller, Source,
        Num_Ref, Num_Alt and VAF are used.  Only rows whose Source is
        "DNA" contribute calls.

        The matrix written to out_filename has one row per distinct
        (Chrom, Pos, Ref, Alt) and these columns:
        - Chrom, Pos, Ref, Alt
        - per sample, the number of callers that called the variant
        - per sample and caller, the formatted call

        Returns the (empty) metadata dictionary.  Raises AssertionError
        if a row has a Source other than "DNA" or "RNA".
        """
        from genomicode import filelib
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix

        simple_file = in_data.identifier
        metadata = {}

        # Read all in memory.  Hopefully, not too big.
        # NOTE(review): header=-1 differs from the header=1 used for
        # other files; confirm its meaning against filelib.read_row.
        ds = []
        for d in filelib.read_row(simple_file, header=-1):
            ds.append(d)
            #if len(ds) > 50000:  # DEBUG
            #    break

        # MuSE sometimes has alternates.
        # Alt       A,C
        # Num_Alt  13,0
        # VAF      0.19,0.0
        # Detect this and fix it.  Take the alternate with the highest VAF.
        # Only Num_Alt and VAF are reduced; Alt itself is left as is.
        for d in ds:
            if d.Num_Alt.find(",") < 0:
                continue
            x1 = d.Num_Alt.split(",")
            x2 = d.VAF.split(",")
            assert len(x1) == len(x2)
            # The results are indexed below, so this relies on map()
            # returning a list (Python 2).
            x1 = map(int, x1)
            x2 = map(float, x2)
            max_vaf = max_i = None
            for i in range(len(x2)):
                if max_vaf is None or x2[i] > max_vaf:
                    max_vaf = x2[i]
                    max_i = i
            assert max_i is not None
            d.Num_Alt = str(x1[max_i])
            d.VAF = str(x2[max_i])

        # Make a list of all the positions.
        positions = {}  # (Chrom, Pos) -> 1
        for d in ds:
            positions[(d.Chrom, int(d.Pos))] = 1
        positions = sorted(positions)

        # Make a list of all the callers.
        callers = {}
        for d in ds:
            callers[d.Caller] = 1
        callers = sorted(callers)

        # Make a list of all the samples.
        samples = {}
        for d in ds:
            samples[d.Sample] = 1
        samples = sorted(samples)

        # Make a list of the coordinates.  These become the rows of the
        # matrix.
        coord_data = {}
        for d in ds:
            x = d.Chrom, int(d.Pos), d.Ref, d.Alt
            coord_data[x] = 1
        coord_data = sorted(coord_data)

        # Make a list of all DNA calls.  Rows without any of Num_Ref,
        # Num_Alt or VAF are dropped.
        call_data = []
        for d in ds:
            assert d.Source in ["DNA", "RNA"]
            if d.Source != "DNA":
                continue
            num_ref = num_alt = vaf = None
            if d.Num_Ref:
                num_ref = int(d.Num_Ref)
            if d.Num_Alt:
                num_alt = int(d.Num_Alt)
            if d.VAF:
                vaf = float(d.VAF)
            if num_ref is None and num_alt is None and vaf is None:
                continue
            call = SimpleVariantMatrix.Call(num_ref, num_alt, vaf)
            x = d.Chrom, int(d.Pos), d.Ref, d.Alt, d.Sample, d.Caller, call
            call_data.append(x)

        # sample -> caller -> chrom, pos, ref, alt -> call
        samp2caller2coord2call = {}
        for x in call_data:
            chrom, pos, ref, alt, sample, caller, call = x
            coord = chrom, pos, ref, alt
            if sample not in samp2caller2coord2call:
                samp2caller2coord2call[sample] = {}
            caller2coord2call = samp2caller2coord2call[sample]
            if caller not in caller2coord2call:
                caller2coord2call[caller] = {}
            coord2call = caller2coord2call[caller]
            # A (sample, caller, coord) may have multiple calls.  For
            # example, for germline samples that are called with each
            # tumor sample.  If this is the case, then take the call
            # with the highest coverage.  The new call wins on a tie or
            # when the old call has no coverage.
            if coord in coord2call:
                old_call = coord2call[coord]
                cov = old_cov = None
                if call.num_ref is not None and call.num_alt is not None:
                    cov = call.num_ref + call.num_alt
                if old_call.num_ref is not None and \
                       old_call.num_alt is not None:
                    old_cov = old_call.num_ref + old_call.num_alt
                if cov is None and old_cov is not None:
                    call = old_call
                elif cov is not None and old_cov is not None and cov < old_cov:
                    call = old_call
            coord2call[coord] = call

        # Count the number of callers that called a variant at each
        # position for each sample.
        samp2coord2caller = {}  # sample -> chrom, pos, ref, alt -> caller -> 1
        # Need to do this first, to make sure each caller is counted
        # at most once.  This is to account for germline samples that
        # are called by each caller multiple times.
        for x in call_data:
            chrom, pos, ref, alt, sample, caller, call = x
            coord = chrom, pos, ref, alt
            if sample not in samp2coord2caller:
                samp2coord2caller[sample] = {}
            if coord not in samp2coord2caller[sample]:
                samp2coord2caller[sample][coord] = {}
            samp2coord2caller[sample][coord][caller] = 1
        samp2coord2nc = {}  # sample -> chrom, pos, ref, alt -> num_callers
        for sample in samp2coord2caller:
            samp2coord2nc[sample] = {}
            for coord in samp2coord2caller[sample]:
                samp2coord2nc[sample][coord] = len(
                    samp2coord2caller[sample][coord])
        #for x in call_data:
        #    chrom, pos, ref, alt, sample, caller, call = x
        #    coord = chrom, pos, ref, alt
        #    if sample not in samp2coord2nc:
        #        samp2coord2nc[sample] = {}
        #    nc = samp2coord2nc[sample].get(coord, 0) + 1
        #    samp2coord2nc[sample][coord] = nc

        # Format everything into an annotation matrix.  Each column has
        # a three-part header; the parts are joined with "___" below.
        headers0 = []
        headers1 = []
        headers2 = []
        all_annots = []

        # Add the positions.
        headers0 += ["", "", "", ""]
        headers1 += ["", "", "", ""]
        headers2 += ["Chrom", "Pos", "Ref", "Alt"]
        for i in range(4):
            x = [x[i] for x in coord_data]
            x = [str(x) for x in x]
            all_annots.append(x)

        # Add the number of callers information.  Coordinates without a
        # call for the sample get an empty string.
        headers0 += ["Num Callers"] * len(samples)
        headers1 += [""] * len(samples)
        headers2 += samples
        for sample in samples:
            annots = []
            for coord in coord_data:
                nc = samp2coord2nc.get(sample, {}).get(coord, "")
                annots.append(nc)
            all_annots.append(annots)

        # Add information about calls.  The sample name is only put in
        # the header of the sample's first caller column.
        for sample in samples:
            caller2coord2call = samp2caller2coord2call.get(sample, {})
            for i, caller in enumerate(callers):
                h0 = ""
                if not i:
                    h0 = sample
                h1 = caller
                h2 = "Ref/Alt/VAF"
                headers0.append(h0)
                headers1.append(h1)
                headers2.append(h2)

                coord2call = caller2coord2call.get(caller, {})
                annots = []
                for coord in coord_data:
                    x = ""
                    call = coord2call.get(coord)
                    if call:
                        x = SimpleVariantMatrix._format_call(call)
                    annots.append(x)
                all_annots.append(annots)

        # Set the headers.
        assert len(headers0) == len(headers1)
        assert len(headers0) == len(headers2)
        assert len(headers0) == len(all_annots)
        headers = [None] * len(headers0)
        for i, x in enumerate(zip(headers0, headers1, headers2)):
            x = "___".join(x)
            headers[i] = x
        matrix = AnnotationMatrix.create_from_annotations(headers, all_annots)
        SimpleVariantMatrix.write_from_am(out_filename, matrix)

        #annot_header = ["Chrom", "Pos", "Ref", "Alt"]
        #matrix = SimpleVariantMatrix.make_matrix(
        #    samples, callers, annot_header, coord_data, named_data,
        #    call_data)
        #SimpleVariantMatrix.write(out_filename, matrix)

        return metadata
Esempio n. 18
0
def read_signatures(
    sigdb_path, desired_normalization, desired_ids, desired_tags):
    """Read the signatures and return the ones that match the
    specifications.

    sigdb_path             Directory that contains signatures.txt and
                           the training files it refers to.
    desired_normalization  Normalizations to accept (case insensitive).
                           Always obeyed.
    desired_ids            Signature IDs, as strings.  If any are given,
                           only those signatures are returned and
                           desired_tags is not used for selection.
    desired_tags           List of tags (case insensitive).  Used for
                           selection only if desired_ids is empty.

    Rows with a blank Normalization, Genes, Metagenes, Quantile, Train0
    or Train1 are skipped, as are rows whose training files cannot be
    found (with or without a ".gz" suffix).  On the returned rows,
    Train0 and Train1 are resolved paths and xID, Genes and Metagenes
    are integers.

    Raises AssertionError if signatures.txt is missing, or if a tag in
    desired_tags does not occur on any row with an accepted
    normalization (this is checked even when desired_ids is given).
    """
    from genomicode import filelib

    opr = os.path.realpath
    opj = os.path.join

    filename = opj(sigdb_path, "signatures.txt")
    assert os.path.exists(filename), "Missing signatures.txt file."

    # All comparisons are case insensitive.
    wanted_norms = set(x.upper() for x in desired_normalization)
    wanted_ids = set(x.upper() for x in desired_ids)
    original_tags = desired_tags[:]
    wanted_tags = set(x.upper() for x in desired_tags)

    tags_in_db = set()
    ds = []
    for d in filelib.read_row(filename, header=1):
        # xls2txt converts all values to floats.  Convert them back to
        # integers.  Blank Genes and Metagenes are left untouched, so
        # that the row is skipped by the completeness check below
        # instead of failing in float("").
        d.xID = int(float(d.xID))
        if d.Genes:
            d.Genes = int(float(d.Genes))
        if d.Metagenes:
            d.Metagenes = int(float(d.Metagenes))

        # Skip if not the right normalization.
        if d.Normalization.upper() not in wanted_norms:
            continue

        # If the IDs are specified, then make sure the id matches.  If
        # no IDs are specified, then make sure the tags match.
        tags = [x.upper() for x in d.Tags.split()]
        tags_in_db.update(tags)
        if wanted_ids:
            if str(d.xID) not in wanted_ids:
                continue
        elif not any(tag in wanted_tags for tag in tags):
            # None of these tags matched any of the desired ones.
            continue

        # Skip if not all parameters supplied.
        if not d.Normalization:
            continue
        if not d.Genes or not d.Metagenes or not d.Quantile:
            continue
        if not d.Train0 or not d.Train1:
            continue

        # Find the training files.  If not found, then skip.
        train0 = opr(opj(sigdb_path, d.Train0))
        train1 = opr(opj(sigdb_path, d.Train1))
        if not os.path.exists(train0):
            train0 = train0 + ".gz"
        if not os.path.exists(train1):
            train1 = train1 + ".gz"
        if not os.path.exists(train0) or not os.path.exists(train1):
            continue
        d.Train0 = train0
        d.Train1 = train1

        ds.append(d)

    # Check to see if all the tags the user specified are valid.
    for tag in original_tags:
        assert tag.upper() in tags_in_db, "Unknown tag: %s" % tag

    return ds
Esempio n. 19
0
def read_sample_group_file(file_or_handle):
    """Return list of (filename, sample, pair).  pair is "", "1", or "2".

    file_or_handle is either the name of an existing file or an open
    handle.  filename is a relative path.

    Reads can be split across multiple files (e.g. for multiple
    lanes), or across pairs.
    Headers:
    Filename  Sample  Pair
    F1         A       1
    F3         A       2
    F2         A       1
    F4         A       2
    F5         B       1
    F6         B       2

    - Filenames should be unique.
    - Filename should be relative.  No full path information.
    - Pair should be 1 or 2.  If single end reads, just leave blank.
    - There can be many Filenames per Sample.  There can be many
      Pairs per Sample (if the reads for one pair are split).
    - The pairs that match (1 to its 2 partner) should be next to
      each other in the file.

    If every Pair in the file is "1", they are all returned as "".
    Raises AssertionError if any of the rules above is violated.
    """
    import os
    from genomicode import filelib

    handle = file_or_handle
    if type(handle) is type(""):
        assert os.path.exists(file_or_handle)
        handle = filelib.openfh(handle)

    data = []
    for d in filelib.read_row(handle, header=1, pad_cols=""):
        assert hasattr(d, "Pair"), "Missing column: Pair"
        pair = d.Pair.strip()
        assert pair in ["", "1", "2"], "Invalid pair: %s" % d.Pair
        data.append((d.Filename, d.Sample, pair))

    # Make sure filenames are unique and carry no path.
    seen = set()
    for filename, sample, pair in data:
        dirname = os.path.split(filename)[0]
        assert not dirname, "Filename should not contain a path: %s" % filename
        assert filename not in seen, "Filename is not unique: %s" % filename
        seen.add(filename)

    # If all the Pairs are "1", then make them all blank.
    if sorted(set(pair for (filename, sample, pair) in data)) == ["1"]:
        data = [(filename, sample, "") for (filename, sample, pair) in data]

    # Group the pairs by sample, keeping the order of the file.
    sample2pairs = {}
    for filename, sample, pair in data:
        sample2pairs.setdefault(sample, []).append(pair)

    for sample in sorted(sample2pairs):
        pairs = sample2pairs[sample]
        unique_pairs = sorted(set(pairs))

        # For each sample, make sure there isn't a mix of paired and
        # single ended files.  It must be all single ended or all paired.
        if unique_pairs == [""] or unique_pairs == ["1"]:  # All single
            continue
        if unique_pairs != ["1", "2"]:
            raise AssertionError(
                "Weird pairing [%s]: %s" % (repr(unique_pairs), sample))

        # Make sure each pair is next to each other: the sample's rows
        # must follow the pattern "1", "2", "1", "2", ...  This has to
        # look at the rows in file order, not at the unique values.
        assert len(pairs) % 2 == 0, "Weird pairing: %s" % sample
        for i in range(0, len(pairs), 2):
            assert pairs[i] == "1", "Weird pairing: %s" % sample
            assert pairs[i + 1] == "2", "Weird pairing: %s" % sample

    return data
def summarize_matches_file(filename,
                           fastq_file1,
                           fastq_file2,
                           num_mismatches,
                           temp_path=None,
                           outfile=None):
    """Count the reads that aligned with at most num_mismatches mismatches.

    filename        Tab-delimited alignment file with a header row.  The
                    columns "query_name" and "NM" are read.
    fastq_file1     FASTQ file whose titles define the set of all reads.
    fastq_file2     Not used by this function.
    num_mismatches  Largest NM value for an alignment to be counted.
    temp_path       Directory for temporary files (default ".").  Only
                    used when the on-disk database is enabled.
    outfile         If given, the results are also written here as JSON.

    Return dictionary with keys:
    total_alignments       int    Distinct read names in fastq_file1.
    perfect_alignments     int    Reads with at least one alignment
                                  whose NM <= num_mismatches.
    perc_perfect           float (0.0-1.0)    Technically, fraction not %
                                  0.0 if fastq_file1 contains no reads.

    Raises AssertionError if the alignment file contains a query_name
    that is not in fastq_file1.
    """
    # May create temporary files in temp_path.  These files could be
    # big (similar in size to the fastq files), so this should
    # ideally be a path with a lot of free space.
    import os
    import tempfile
    import json
    from genomicode import filelib
    from genomicode import genomelib

    if temp_path is None:
        temp_path = "."

    # Get the list of all possible read names from fastq_file1.
    # Title from fastq file:
    #   @ST-J00106:107:H5NK2BBXX:1:1101:1438:1173 1:N:0:NAGATC
    # From alignment file:
    #   ST-J00106:107:H5NK2BBXX:1:2218:22079:11653

    IN_MEMORY = True

    temp_filename = None
    all_aligns = None
    try:
        if IN_MEMORY:
            # This can take a lot of memory.
            all_aligns = {}
            null_value = 0
        else:
            # To minimize the use of memory, store in a database.
            # This is really slow.  Import gdbm here so that it is
            # only required when the database is actually used.
            import gdbm
            fd, temp_filename = tempfile.mkstemp(dir=temp_path)
            os.close(fd)
            all_aligns = gdbm.open(temp_filename, "nf")
            null_value = "0"

        for title, sequence, quality in genomelib.read_fastq(fastq_file1):
            name = title
            if name.startswith("@"):
                name = name[1:]
            # alignment file only contains the first part.
            name = name.split()[0]
            all_aligns[name] = null_value

        # Keep track of the ones that I've seen before to make sure
        # we don't double count.
        perfect = 0
        for d in filelib.read_row(filename, header=1):
            # This check makes the function very slow.
            assert d.query_name in all_aligns
            # Skip unaligned reads.
            if not d.NM:
                continue
            if int(d.NM) > num_mismatches:
                continue
            num_seen = int(all_aligns[d.query_name])
            if num_seen == 0:
                perfect += 1
            num_seen += 1
            if not IN_MEMORY:
                # The database can only store strings.
                num_seen = str(num_seen)
            all_aligns[d.query_name] = num_seen
        total = len(all_aligns)
    finally:
        # Compare against None so that an empty database is closed too.
        if all_aligns is not None and not IN_MEMORY:
            all_aligns.close()
        # dbm will create files:
        # <temp_file>
        # <temp_file>.dir
        # <temp_file>.pag
        # Delete them all.
        if temp_filename is not None:
            temp_file = os.path.split(temp_filename)[1]
            for name in os.listdir(temp_path):
                if name.startswith(temp_file):
                    os.unlink(os.path.join(temp_path, name))

    # Avoid dividing by zero when the fastq file has no reads.
    perc_perfect = 0.0
    if total:
        perc_perfect = float(perfect) / total
    results = {
        "perfect_alignments": perfect,
        "total_alignments": total,
        "perc_perfect": perc_perfect,
    }
    if outfile is not None:
        handle = open(outfile, 'w')
        try:
            handle.write(json.dumps(results))
        finally:
            handle.close()
    return results
Esempio n. 21
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Add "Linkage" annotation columns to a SimpleVariantMatrix file.

        The file named by the "linked_variants_file" user option is
        matched to the variants by (Chrom, Pos).  Two columns are
        inserted at column index 4: "Perc Linked" (copied from the
        linked file) and "Score", which is -10*log10(p), or MAX_SCORE
        when p is below 10**-(MAX_SCORE/10).  Variants without a match
        get blank values.  The result is written to outfile.
        """
        import math
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        filelib.assert_exists_nz(in_data.identifier)

        linked_file = mlib.get_user_option(
            user_options, "linked_variants_file", not_empty=True,
            check_file=True)

        # Coordinates of every variant, in matrix order.
        SVM = SimpleVariantMatrix.read_as_am(in_data.identifier)
        chroms = SVM["______Chrom"]
        positions = [int(p) for p in SVM["______Pos"]]
        coords = list(zip(chroms, positions))
        known_coords = set(coords)

        # Linked variant file columns:
        # Chrom  Pos  Perc Linked  p
        # Keep only the rows that hit a variant in the matrix.  If a
        # coordinate occurs more than once, the last row wins.
        coord2row = {}
        for row in filelib.read_row(linked_file, header=1):
            pos = int(row.Pos)
            key = (row.Chrom, pos)
            if key in known_coords:
                coord2row[key] = row

        # Build one row of annotations per variant.
        MAX_SCORE = 1000
        p_floor = 10**-(MAX_SCORE / 10)
        column_names = ["Perc Linked", "Score"]
        rows = []
        for coord in coords:
            if coord in coord2row:
                info = coord2row[coord]
                p_value = float(info.p)
                if p_value >= p_floor:
                    score = -10 * math.log(p_value, 10)
                else:
                    # p is too small to convert; use the cap instead.
                    score = MAX_SCORE
                values = info.Perc_Linked, score
                assert len(values) == len(column_names)
            else:
                values = [""] * len(column_names)
            rows.append(values)

        # Convert to SVM format: prefixed headers, one list per column.
        new_headers = ["Linkage______%s" % name for name in column_names]
        new_columns = jmath.transpose(rows)

        # Splice the new columns in at a fixed position.
        INDEX = 4
        old_columns = [SVM.header2annots[h] for h in SVM.headers_h]
        headers = SVM.headers[:INDEX] + new_headers + SVM.headers[INDEX:]
        all_annots = old_columns[:INDEX] + new_columns + old_columns[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
def add_coverage_to_svm(svm_file, coverage_file, outfile, is_rna_cov):
    """Append a coverage matrix to a SimpleVariantMatrix file.

    svm_file       SimpleVariantMatrix file to read.
    coverage_file  Tab-delimited file with a header row:
                   Chrom  Pos  <Sample>  [<Sample> ...]
    outfile        Where the augmented SimpleVariantMatrix is written.
    is_rna_cov     If true, the new matrix is named "RNA Coverage"
                   rather than "Coverage".

    Only the samples found in both files are reported.  Raises an
    AssertionError if the two files have no sample in common.
    """
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import AnnotationMatrix
    from genomicode import SimpleVariantMatrix

    # Coordinates of the variants, in matrix order.
    SVM = SimpleVariantMatrix.read(svm_file)
    AM = SVM.annot_matrix
    assert "Chrom" in AM.headers
    assert "Pos" in AM.headers
    chroms = AM["Chrom"]
    positions = [int(p) for p in AM["Pos"]]

    # Load the coverage values.  They are kept as the strings found in
    # the file; blank cells are skipped.
    coverage = {}  # (chrom, pos) -> sample -> coverage string
    covered_samples = set()
    for row in filelib.read_row(coverage_file, header=1):
        by_sample = coverage.setdefault((row.Chrom, int(row.Pos)), {})
        # The first two columns are Chrom and Pos; the rest are samples.
        for idx in range(2, len(row._header)):
            value = row._cols[idx]
            if not value:
                continue
            sample = row._header[idx]
            by_sample[sample] = value
            covered_samples.add(sample)

    # If the samples aren't sequenced at high coverage, it's possible
    # they just don't have reads at these positions.  Be lenient and
    # accept the file as long as at least one sample overlaps, then
    # report the coverage for the samples at the intersection.
    shared_samples = [s for s in SVM.samples if s in covered_samples]
    assert shared_samples, (
        "SimpleVariantMatrix and coverage file have "
        "no common samples.")

    # One row per variant, one column per shared sample.  Anything
    # without a coverage value is left blank.
    matrix = []
    for idx in range(AM.num_annots()):
        by_sample = coverage.get((chroms[idx], positions[idx]), {})
        matrix.append([by_sample.get(s, "") for s in shared_samples])

    # Attach the coverage as a new named matrix.
    name = "RNA Coverage" if is_rna_cov else "Coverage"
    cov_matrix = AnnotationMatrix.create_from_annotations(
        shared_samples, jmath.transpose(matrix))
    SVM.named_matrices.append((name, cov_matrix))

    SimpleVariantMatrix.write(outfile, SVM)
def clean_snpeff_file(infile, outfile):
    """Collapse a tab-delimited SnpEff table to one row per variant.

    Rows that share (Chrom, Pos, Ref, Alt) are sorted by Allele,
    Gene_Name, Feature_Type and Feature_ID (ties keep their order from
    the file) and merged into one row.  In each remaining column the
    values are joined with commas, unless they are all identical, in
    which case the value is written once.  The rows are written to
    outfile in natural order of Chrom and Pos, under the same header
    as infile.

    Does nothing if infile does not exist.  Raises an AssertionError
    if a value to be merged already contains a comma, or if no header
    can be found.
    """
    import os
    from genomicode import filelib
    from genomicode import sortlib

    if not os.path.exists(infile):
        return

    # Chrom
    # Pos
    # Ref
    # Alt
    # Allele                    Multiple Feature_ID per Allele.
    # Annotation
    # Annotation_Impact
    # Gene_Name                 Unique, given coord and Feature_ID.
    # Gene_ID                   Unique, given coord and Feature_ID.
    # Feature_Type              Unique, given coord and Feature_ID.
    # Feature_ID                Can have multiple Feature_ID per coord.
    # Transcript_BioType        Unique, given coord and Feature_ID.
    # Rank
    # HGVS.c
    # HGVS.p
    # cDNA.pos / cDNA.length
    # CDS.pos / CDS.length
    # AA.pos / AA.length
    # Distance
    # ERRORS / WARNINGS / INFO  Unique, given coord and Feature_ID.

    if False:  # For debugging
        # Check to see which columns are unique given the same coordinate.
        coord2header2values = {}  # coord -> header -> list of values
        for d in filelib.read_row(infile, header=1):
            coord = d.Chrom, d.Pos, d.Ref, d.Alt, d.Feature_ID
            if coord not in coord2header2values:
                coord2header2values[coord] = {}
            header2values = coord2header2values[coord]
            for header in d._nheader:
                if header in ["Chrom", "Pos", "Ref", "Alt", "Allele"]:
                    continue
                value = getattr(d, header)
                if header not in header2values:
                    header2values[header] = []
                if value not in header2values[header]:
                    header2values[header].append(value)
        not_unique = {}  # header -> 1
        for coord, header2values in coord2header2values.items():
            for header, values in header2values.items():
                if len(values) > 1:
                    not_unique[header] = 1
        for x in sorted(not_unique):
            print(x)

    DELIM = ","
    NUM_COORD_COLS = 4  # Chrom, Pos, Ref, Alt

    # Group the records by coordinate.
    coord2ds = {}  # (chrom, pos, ref, alt) -> list of d
    header = None
    row_iter = filelib.read_row(infile, header=1)
    for d in row_iter:
        if not header:
            header = d._header
        coord = d.Chrom, d.Pos, d.Ref, d.Alt
        coord2ds.setdefault(coord, []).append(d)
    if header is None:
        # No data rows.  Try the header held by the reader itself so
        # that a header-only file still produces a header-only output.
        header = getattr(row_iter, "_header", None)
    assert header is not None, "No header found: %s" % infile

    def record_key(d):
        return d.Allele, d.Gene_Name, d.Feature_Type, d.Feature_ID

    # Merge the records for each coordinate into a single row.
    coord2row = {}  # coord -> list
    for coord, ds in coord2ds.items():
        # Sorting with a key function is stable and never needs to
        # compare the record objects themselves.
        ds = sorted(ds, key=record_key)
        matrix = [d._cols for d in ds]
        assert matrix
        first = matrix[0]

        row = []
        # Coordinate should be unique.
        for j in range(NUM_COORD_COLS):
            for cols in matrix:
                assert cols[j] == first[j]
            row.append(first[j])
        # Just merge each of the other annotations using commas.
        for j in range(NUM_COORD_COLS, len(first)):
            values = [cols[j] for cols in matrix]
            for value in values:
                assert DELIM not in value
            # If every value is the same, then just use the first value.
            if all(value == values[0] for value in values):
                values = values[:1]
            row.append(DELIM.join(values))
        assert len(row) == len(header)
        coord2row[coord] = row

    # Order the rows naturally by Chrom and Pos, then by Ref and Alt.
    h = sortlib.hash_natural
    all_coords = sorted(
        coord2row, key=lambda c: (h(c[0]), h(c[1]), c[2], c[3], c))

    # Write out each of the rows.  Always close the file, even if
    # something goes wrong part way through.
    handle = open(outfile, 'w')
    try:
        handle.write("\t".join(header) + "\n")
        for coord in all_coords:
            handle.write("\t".join(map(str, coord2row[coord])) + "\n")
    finally:
        handle.close()
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Annotate the variants of a SimpleVariantMatrix with Annovar.

        Writes the variant coordinates to "pos.txt", runs the command
        built by alignlib.make_annovar_command on it, and adds the
        columns of the resulting multianno file (everything after
        Chr, Start, End, Ref, Alt) as the first named matrix,
        "Annovar".  Variants that are not found in the Annovar output
        get blank annotations.  The result is written to out_filename.

        Returns a metadata dictionary whose "commands" entry holds the
        Annovar command that was run.
        """
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        summary_filename = in_data.identifier
        metadata = {}

        buildver = mlib.get_user_option(user_options,
                                        "annovar_buildver",
                                        allowed_values=["hg19"],
                                        not_empty=True)

        # Name files.
        annovar_infile = "pos.txt"
        log_filename = "annovar.log"
        # Annovar takes a filestem, without the ".vcf".
        annovar_outstem = "annotations"
        # Produces file:
        # <annovar_outstem>.hg19_multianno.txt
        multianno_file = "%s.hg19_multianno.txt" % annovar_outstem

        # Make the infile for Annovar.
        # <chrom> <start> <end> <ref> <alt>
        handle = open(annovar_infile, 'w')
        try:
            # The first two lines of the summary file are skipped; the
            # third is used as the header.
            for d in filelib.read_row(summary_filename, skip=2, header=1):
                x = d.Chrom, d.Pos, d.Pos, d.Ref, d.Alt
                handle.write("\t".join(x) + "\n")
        finally:
            handle.close()

        cmd = alignlib.make_annovar_command(annovar_infile,
                                            log_filename,
                                            annovar_outstem,
                                            buildver,
                                            vcf_input=False)
        parallel.sshell(cmd)
        metadata["commands"] = [cmd]

        filelib.assert_exists_nz(log_filename)
        filelib.assert_exists_nz(multianno_file)

        matrix = SimpleVariantMatrix.read(summary_filename)
        annot_matrix = matrix.annot_matrix
        chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
        ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
        pos = [int(x) for x in pos]

        # Read in the multianno output file.
        pos2d = {}  # (chrom, start, ref, alt) -> d
        anno_header = None
        for d in filelib.read_row(multianno_file, header=1):
            key = d.Chr, int(d.Start), d.Ref, d.Alt
            assert key not in pos2d, "Duplicate pos: %s" % str(key)
            pos2d[key] = d
            if not anno_header:
                anno_header = d._header
        assert anno_header

        # Multianno starts with:
        # Chr Start End Ref Alt
        # Ignore these.
        assert anno_header[:5] == ["Chr", "Start", "End", "Ref", "Alt"]
        headers = anno_header[5:]

        # Find the Annovar record for each variant once, rather than
        # once for every annotation column.
        matched = [pos2d.get(coord) for coord in zip(chrom, pos, ref, alt)]

        # One list of annotations per Annovar column.  Variants that
        # Annovar did not report are left blank.
        all_annots = []
        for i in range(5, len(anno_header)):
            annots = [d._cols[i] if d else "" for d in matched]
            all_annots.append(annots)
        x = AnnotationMatrix.create_from_annotations(headers, all_annots)
        matrix.named_matrices.insert(0, ("Annovar", x))

        SimpleVariantMatrix.write(out_filename, matrix)

        return metadata
Esempio n. 25
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Add "Cancer Genes" annotation columns to a SimpleVariantMatrix.

        The file named by the "cancer_genes_file" user option has one
        row per gene and, apart from "Gene ID" and "Gene Symbol", one
        column per dataset holding "" or "1".  For each variant, the
        genes in the "Annovar______Gene.refGene" annotation are looked
        up by symbol, and a dataset column is set to 1 if any of the
        genes is flagged in that dataset.  The new columns are placed
        after any Annovar, SnpEff or COSMIC columns, and the result is
        written to outfile.
        """
        from genomicode import filelib
        from genomicode import hashlib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        filelib.assert_exists_nz(in_data.identifier)

        gene_file = mlib.get_user_option(
            user_options, "cancer_genes_file", not_empty=True, check_file=True)

        # Read the cancer genes file.
        # <Gene ID>  <Gene Symbol>  <Dataset>  ...
        symbol2row = {}  # symbol -> row of the cancer genes file
        dataset_names = None
        for row in filelib.read_row(gene_file, header=1):
            assert "Gene Symbol" in row._header
            if dataset_names is None:
                # Every column other than the gene identifiers.
                dataset_names = [
                    name for name in row._header
                    if name not in ["Gene ID", "Gene Symbol"]]
            symbol = row.Gene_Symbol
            if symbol:
                symbol2row[symbol] = row

        # Read the variant file.
        SVM = SimpleVariantMatrix.read_as_am(in_data.identifier)

        GENE_H = "Annovar______Gene.refGene"
        assert GENE_H in SVM.headers, "Missing annotation: %s" % GENE_H

        # Build one row of dataset flags per variant.
        flag_rows = []
        for gene_str in SVM[GENE_H]:
            # Format of genes:
            # PFN1P2
            # PMS2P2,PMS2P7
            flags = [""] * len(dataset_names)
            for symbol in gene_str.split(","):
                if symbol not in symbol2row:
                    continue
                row = symbol2row[symbol]
                for col, name in enumerate(dataset_names):
                    # The row exposes each column under its hashed name.
                    attr = hashlib.hash_var(name)
                    assert hasattr(row, attr)
                    value = getattr(row, attr)
                    assert value in ["", "1"]
                    if value == "1":
                        flags[col] = 1
            flag_rows.append(flags)

        # Convert to SVM format: prefixed headers, one list per column.
        new_headers = ["Cancer Genes______%s" % name for name in dataset_names]
        new_columns = jmath.transpose(flag_rows)

        # Figure out where to put these annotations.  Start at column
        # 4, and move past the last column of each annotation group
        # that exists.
        INDEX = 4
        for prefix in ("ANNOVAR", "SNPEFF", "COSMIC"):
            hits = [i for (i, h) in enumerate(SVM.headers)
                    if h.upper().startswith(prefix)]
            if hits:
                INDEX = max(INDEX, max(hits)+1)

        # Make the new SimpleVariantMatrix.
        old_columns = [SVM.header2annots[h] for h in SVM.headers_h]
        headers = SVM.headers[:INDEX] + new_headers + SVM.headers[INDEX:]
        all_annots = old_columns[:INDEX] + new_columns + old_columns[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Add "COSMIC" annotation columns to a SimpleVariantMatrix file.

        The file named by the "cosmic_variants_file" user option is
        matched to the variants by chromosome and position.  Seven
        columns are added after any Annovar or SnpEff columns, and
        variants without a match get blank values.  The result is
        written to outfile.
        """
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        filelib.assert_exists_nz(in_data.identifier)

        cosmic_file = mlib.get_user_option(
            user_options, "cosmic_variants_file", not_empty=True,
            check_file=True)

        # Coordinates of every variant, in matrix order.
        SVM = SimpleVariantMatrix.read_as_am(in_data.identifier)
        chroms = SVM["______Chrom"]
        positions = [int(p) for p in SVM["______Pos"]]
        coords = list(zip(chroms, positions))
        known_coords = set(coords)

        # Read the COSMIC variant file.
        # Chrom  Start  End  GRCh  Count  SNP
        # Mutation CDS  Mutation AA
        # FATHMM prediction  FATHMM score  Mutation somatic status
        coord2row = {}  # (chrom, pos) -> row of the COSMIC file
        for row in filelib.read_row(cosmic_file, header=1):
            start, end = int(row.Start), int(row.End)
            # NOTE(review): only the first position in [Start, End]
            # that is present in the matrix is keyed.  Other matrix
            # positions inside the same interval are not annotated by
            # this row -- confirm this is intended.
            for pos in range(start, end+1):
                key = (row.Chrom, pos)
                if key in known_coords:
                    coord2row[key] = row
                    break

        # Build one row of COSMIC annotations per variant.
        column_names = [
            "SNP", "Num Tumors", "Mutation CDS", "Mutation AA",
            "FATHMM prediction", "FATHMM score", "Mutation somatic status"]
        rows = []
        for coord in coords:
            if coord in coord2row:
                info = coord2row[coord]
                values = (
                    info.SNP, info.Count, info.Mutation_CDS,
                    info.Mutation_AA, info.FATHMM_prediction,
                    info.FATHMM_score, info.Mutation_somatic_status)
            else:
                values = [""] * len(column_names)
            rows.append(values)

        # Convert to SVM format: prefixed headers, one list per column.
        new_headers = ["COSMIC______%s" % name for name in column_names]
        new_columns = jmath.transpose(rows)

        # Figure out where to put these annotations.  Start at column
        # 4, and move past the last column of each annotation group
        # that exists.
        INDEX = 4
        for prefix in ("ANNOVAR", "SNPEFF"):
            hits = [i for (i, h) in enumerate(SVM.headers)
                    if h.upper().startswith(prefix)]
            if hits:
                INDEX = max(INDEX, max(hits)+1)

        # Make the new SimpleVariantMatrix.
        old_columns = [SVM.header2annots[h] for h in SVM.headers_h]
        headers = SVM.headers[:INDEX] + new_headers + SVM.headers[INDEX:]
        all_annots = old_columns[:INDEX] + new_columns + old_columns[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)