Code Example #1
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import itertools
        from genomicode import config
        from genomicode import parallel
        from genomicode import filelib

        signal_node, annotation_node = antecedents
        signal_filename = signal_node.identifier
        annotation_filename = annotation_node.identifier
        filelib.assert_exists_nz(signal_filename)
        filelib.assert_exists_nz(annotation_filename)
        metadata = {}

        align_matrices = filelib.which_assert(config.align_matrices)

        # Make sure the signal_filename has an ID_REF header.
        header = filelib.read_cols(signal_filename).next()
        assert header[0] == "ID_REF", "Missing ID_REF header: %s" % \
               signal_filename

        signal_align_file = "signal.aligned.txt"
        annot_align_file = "annot.aligned.txt"

        # First, align the two files.
        sq = parallel.quote
        cmd = [
            sq(align_matrices),
            "--annot_file",
            sq(signal_filename),
            "--header",
            "ID_REF",
            "--annot_file",
            sq(annotation_filename),
            "--left_join",
            signal_align_file,
            annot_align_file,
        ]
        cmd = " ".join(cmd)
        parallel.sshell(cmd)
        metadata["command"] = cmd

        # Now merge them.  Take the first column of the expression
        # file (should be ID_REF), the whole annotation file, then the
        # remainder of the expression file.
        signal_handle = filelib.read_cols(signal_align_file)
        annot_handle = filelib.read_cols(annot_align_file)
        outhandle = open(outfile, 'w')
        for x1, x2 in itertools.izip(signal_handle, annot_handle):
            x = [x1[0]] + x2 + x1[1:]
            print >> outhandle, "\t".join(x)
        outhandle.close()

        #cmd = "paste %s %s > %s" % (
        #    annot_align_file, signal_align_file, outfile)
        #shell.single(cmd)

        filelib.assert_exists_nz(outfile)
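
Note the merge step above: itertools.izip streams both aligned files row
by row, so the merged matrix is never held in memory.  This relies on
align_matrices having written both files with the same rows in the same
order, which the --left_join alignment is expected to guarantee.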
Code Example #2
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import mplgraph
        in_data = antecedents
        matrix = [x for x in filelib.read_cols(in_data.identifier)]
        matrix = [x[1:] for x in matrix]
        matrix = jmath.transpose(matrix)
        sample = matrix[0][1:]
        data = matrix[1:]
        if not os.path.exists(outfile):
            os.mkdir(outfile)

        for one_data in data:
            value = one_data[1:]
            value = [float(i) for i in value]
            pair = [(value[i], sample[i]) for i in range(len(value))]
            pair.sort()
            gene_value = [i[0] for i in pair]
            label = [i[1] for i in pair]
            ylabel = one_data[0]
            fig = mplgraph.barplot(gene_value,
                                   box_label=label,
                                   xtick_rotation=90,
                                   xlabel='sample',
                                   ylabel=ylabel)
            output = os.path.join(outfile, ylabel)
            fig.savefig(output + '.png')

        assert filelib.exists_nz(outfile), (
            'failed to generate output for plot_geneset_score_bar: %s' % outfile)
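
Note that outfile here is a directory: the module creates it if needed
and writes one <geneset>.png bar plot per row of the transposed matrix.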
Code Example #3
def read_geneset_scores(filename):
    # Read the output from score_geneset.py and return a Matrix
    # object.
    import os
    from genomicode import jmath
    from genomicode import filelib
    from genomicode import Matrix
    from arrayio import const
    from arrayio import tab_delimited_format as tdf

    assert os.path.exists(filename)
    matrix = [x for x in filelib.read_cols(filename)]
    matrix = jmath.transpose(matrix)

    # Only want the scores.  Get rid of the direction, pvalue, and
    # significance lines.
    # Columns:
    # SAMPLE
    # FILE
    # [Score ...]
    # [Direction ...] " direction"
    # [p value ...] " pvalue"
    # [significant ...] " significant"
    assert matrix
    i = 0
    while i < len(matrix):
        assert matrix[i]
        metadata = False
        if matrix[i][0].endswith(" direction"):
            metadata = True
        elif matrix[i][0].endswith(" pvalue"):
            metadata = True
        elif matrix[i][0].endswith(" significant"):
            metadata = True
        if not metadata:
            i += 1
            continue
        del matrix[i]

    # BUG: Need more checks on size and format of matrix.
    col_names = {}
    sample_row = 0
    if matrix[1][0].upper() == "SAMPLE":
        sample_row = 1
    col_names[tdf.SAMPLE_NAME] = matrix[sample_row][1:]
    row_names = {}
    row_names['geneset'] = []
    synonyms = {}
    synonyms[const.COL_ID] = tdf.SAMPLE_NAME
    data = []
    for line in matrix[2:]:
        single_data = [jmath.safe_float(i) for i in line[1:]]
        data.append(single_data)
        row_names['geneset'].append(line[0])
    M = Matrix.InMemoryMatrix(data,
                              row_names=row_names,
                              col_names=col_names,
                              synonyms=synonyms)
    return M
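
A minimal usage sketch, assuming the column layout documented in the
function.  The file name, sample names, and gene set "GS1" are all
hypothetical; note that the input has gene sets as columns, which the
function transposes into rows:

handle = open("scores.txt", 'w')
handle.write("SAMPLE\tFILE\tGS1\tGS1 pvalue\n")
handle.write("A\tf1\t1.5\t0.01\n")
handle.write("B\tf2\t-0.3\t0.2\n")
handle.close()
M = read_geneset_scores("scores.txt")   # the "GS1 pvalue" column is dropped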
Code Example #4
File: bfrmnorm.py  Project: firebitsbr/changlab
def label_control_probes(probe_ids, control_probe_file):
    # BFRM_Normalize expects control probes to start with "AFFX" in
    # all upper case.  Make sure I can find these probes.
    import os
    from genomicode import config
    from genomicode import filelib

    control_probes = {}

    # First, take a look to see if any affymetrix control probes
    # exist.
    for pid in probe_ids:
        if not pid.upper().startswith("AFFX"):
            continue
        control_probes[pid.upper()] = 1

    # Use the probes from the control probe file if:
    # 1.  a control probe file is specified    OR
    # 2.  no affx probes exist (use a default control probe file).
    if not control_probes and not control_probe_file:
        control_probe_file = config.illumina_HUMANHT12_CONTROL
        assert os.path.exists(control_probe_file), \
               "I could not find any control probes."
    if control_probe_file:
        assert os.path.exists(control_probe_file), \
               "I could not find file: %s" % control_probe_file
        control_probes = {}
        for cols in filelib.read_cols(control_probe_file):
            for x in cols:
                control_probes[x.upper()] = 1

    # Hack: If it is an Illumina control probe, then prepend "AFFX_"
    # to it so that BFRM_Normalize will recognize it as a control.
    probe_ids = probe_ids[:]
    found = False
    for i, pid in enumerate(probe_ids):
        upid = pid.upper()
        is_control_probe = upid in control_probes
        if is_control_probe:
            found = True
        # If a probe is not a control and starts with AFFX, mask it
        # out so that BFRM_Normalize will not recognize it.
        if not is_control_probe and upid.startswith("AFFX"):
            pid = "AFF_" + pid[4:]
        # If a probe is a control and does not start with AFFX, add
        # AFFX so that BFRM_Normalize will recognize it.
        if is_control_probe and not upid.startswith("AFFX"):
            pid = "AFFX_%s" % pid
        if is_control_probe:
            assert pid.startswith("AFFX")
        else:
            assert not pid.startswith("AFFX")
        probe_ids[i] = pid

    assert found, "I could not find any control probes."
    return probe_ids
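
A minimal usage sketch (hypothetical probe IDs, no control probe file).
Because one probe already starts with "AFFX", it is taken as the control
and the list comes back unchanged:

probe_ids = ["AFFX-BioB-5_at", "ILMN_1651229", "ILMN_1651254"]
labeled = label_control_probes(probe_ids, None)
assert labeled == probe_ids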
Code Example #5
def read_fastqc_summary(filename):
    # Return list of (<status>, <statistic>, <filename>)
    import os
    from genomicode import filelib

    assert os.path.exists(filename)
    data = []
    for x in filelib.read_cols(filename):
        assert len(x) == 3
        # Use a different name here so the filename argument isn't clobbered.
        status, statistic, fname = x
        data.append((status, statistic, fname))
    return data
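
A minimal usage sketch ("fastqc_summary.txt" is a hypothetical path;
PASS, WARN, and FAIL are the statuses FastQC writes):

for status, statistic, fname in read_fastqc_summary("fastqc_summary.txt"):
    if status == "FAIL":
        print statistic, fname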
Code Example #6
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import mplgraph
        from genomicode import filelib
        in_data = antecedents
        matrix = [x for x in filelib.read_cols(in_data.identifier)]
        header = matrix[0]
        index = header.index('Confidence')
        matrix = matrix[1:]
        # Keep the raw values as strings; they may be blank, and the
        # check below compares against blanks before converting to float.
        confidence = [i[index] for i in matrix]
        sample = [i[0] for i in matrix]
        if confidence == [''] * len(matrix) or 'Correct?' in header:
            index = header.index('Predicted_class')
            class_value = [i[index] for i in matrix]
            label_dict = dict()
            label_list = []
            i = -1
            for label in class_value:
                if label not in label_dict:
                    i = i + 1
                    label_dict[label] = i
                label_list.append(label_dict[label])
            yticks = label_dict.keys()
            ytick_pos = [label_dict[i] for i in yticks]
            fig = mplgraph.barplot(label_list,
                                   box_label=sample,
                                   ylim=(-0.5, 1.5),
                                   ytick_pos=ytick_pos,
                                   yticks=yticks,
                                   xtick_rotation='vertical',
                                   ylabel='Prediction',
                                   xlabel='Sample')
            fig.savefig(outfile)
        else:
            # Blanks were ruled out above, so the conversion is safe here.
            confidence = [float(i) for i in confidence]
            fig = mplgraph.barplot(confidence,
                                   box_label=sample,
                                   ylim=(-1.5, 1.5),
                                   xtick_rotation='vertical',
                                   ylabel='Prediction',
                                   xlabel='Sample')
            fig.savefig(outfile)

        assert filelib.exists_nz(outfile), (
            'failed to generate output for plot_prediction_bar: %s' % outfile
        )
Code Example #7
def _convert_gene_ids_local(in_platform, out_platform):
    # Return a dictionary of gene_id -> list of converted_ids, or None
    # if these platforms cannot be converted.
    import os
    from genomicode import config
    from genomicode import filelib

    filelib.assert_exists_nz(config.convert_platform)
    x = "%s___%s.txt" % (in_platform, out_platform)
    filename = os.path.join(config.convert_platform, x)
    if not os.path.exists(filename):
        return None

    in2out = {}
    for cols in filelib.read_cols(filename):
        # <in_id>  <out_id1> ... <out_idn>
        assert len(cols) >= 2
        in_id = cols[0]
        out_ids = cols[1:]
        in2out[in_id] = out_ids
    return in2out
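
A minimal usage sketch.  The platform names and probe ID are
hypothetical; any pair with a <in>___<out>.txt file under
config.convert_platform will work:

in2out = _convert_gene_ids_local("HG_U133A", "Entrez_ID_human")
if in2out is None:
    print "no local conversion file for this pair of platforms"
else:
    print in2out.get("1007_s_at", [])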
Code Example #8
def merge_parsed_files(parsed_files, outfile):
    # First, make sure each of the parsed files has the same header.
    from genomicode import filelib

    assert parsed_files

    header = None
    for f in parsed_files:
        cols = filelib.read_cols(f).next()
        if not header:
            header = cols
        assert header == cols, "Mismatched headers"
    assert header

    handle = open(outfile, 'w')
    seen = {}
    for f in parsed_files:
        for line in filelib.openfh(f):
            if line in seen:
                continue
            seen[line] = 1
            print >> handle, line,
    handle.close()
Code Example #9
def fix_cluster30_dup_header(filename):
    # Cluster30 creates a file with "NAME" as the header for the third
    # column.  If the infile also has a "NAME" column, then this will
    # be duplicated.  Detect this situation and fix it.
    from genomicode import filelib

    filelib.assert_exists_nz(filename)
    matrix = [x for x in filelib.read_cols(filename)]
    assert matrix
    assert matrix[0]
    header = matrix[0]
    # GID  <COL0>  NAME  GWEIGHT  <COL1>  [<SAMPLES>...]
    assert len(header) >= 5
    changed = False
    if header[1] == "NAME" and header[2] == "NAME":
        header[1] = "NAME_"
        changed = True
    if not changed:
        return
    handle = open(filename, 'w')
    for x in matrix:
        print >> handle, "\t".join(x)
    handle.close()
Code Example #10
def _read_vcf(filename):
    # Return a tuple of:
    # - a list of lines.  Each line is a list of columns.
    # - the index of the header row (or None)
    # - the sample names
    from genomicode import filelib

    lines = [x for x in filelib.read_cols(filename)]
    header_i = None
    for i, cols in enumerate(lines):
        if cols[0] == "#CHROM":
            header_i = i
            break
    assert header_i is not None, "Could not find #CHROM: %s" % filename

    header = lines[header_i]
    x = [
        "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"
    ]
    assert header[:len(x)] == x, "Unknown format: %s" % header
    samples = header[len(x):]
    return lines, header_i, samples
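
A minimal usage sketch ("calls.vcf" is a hypothetical VCF path):

lines, header_i, samples = _read_vcf("calls.vcf")
print "%d samples, %d data rows" % (len(samples), len(lines) - header_i - 1)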
Code Example #11
def _make_intervallist_file(intervallist_file, features_bed, bam_filename):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    outhandle = open(intervallist_file, 'w')

    # Add the @HD and @SQ headers from the bam file.
    # samtools view -H <filename>
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        "-H",
        sq(bam_filename),
    ]
    cmd = " ".join(cmd)
    x = parallel.sshell(cmd)
    lines = x.split("\n")
    lines = [x.rstrip() for x in lines]

    for line in lines:
        if line.startswith("@HD") or line.startswith("@SQ"):
            print >> outhandle, line

    # Add the information from the BAM files.
    # BED       chrom chromStart (0-based) chromEnd name score strand
    # Interval  chrom chromStart (1-based) chromEnd strand name
    for cols in filelib.read_cols(features_bed):
        assert len(cols) >= 6
        chrom, chromStart0, chromEnd, name, score, strand = cols[:6]
        chromStart0, chromEnd = int(chromStart0), int(chromEnd)
        chromStart1 = chromStart0 + 1
        x = chrom, chromStart1, chromEnd, strand, name
        print >> outhandle, "\t".join(map(str, x))
    outhandle.close()
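
The +1 on chromStart is the whole conversion: BED intervals are 0-based
and half-open, while interval lists are 1-based and inclusive, so the
start shifts by one and the end stays the same.  For example, the BED
line "chr1  99  200  geneA  0  +" becomes the interval line
"chr1  100  200  +  geneA".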
Code Example #12
def list_snpeff_databases():
    import os
    import StringIO
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    path = mlib.get_config("snp_eff_path", which_assert_file=True)
    snpeff = os.path.join(path, "snpEff.jar")
    filelib.assert_exists_nz(snpeff)

    # Genome    Organism    Status    Bundle    Database download link
    # ------    --------    ------    ------    ----------------------
    sq = parallel.quote
    cmd = [
        "java",
        "-Xmx16g",
        "-jar",
        sq(snpeff),
        "databases",
    ]
    # Join into a single command string, as the other sshell calls in
    # this collection do.
    cmd = " ".join(cmd)
    output = parallel.sshell(cmd)
    header = i_db = None
    databases = []
    for cols in filelib.read_cols(StringIO.StringIO(output)):
        cols = [x.strip() for x in cols]
        if header is None:
            header = cols
            assert "Genome" in header
            i_db = header.index("Genome")
            continue
        assert len(cols) == len(header)
        if cols[0].startswith("---"):
            continue
        db_name = cols[i_db]
        databases.append(db_name)
    return databases
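
A minimal usage sketch (GRCh38 is a real snpEff genome-name prefix, but
the exact database names depend on the installed snpEff version):

for name in list_snpeff_databases():
    if name.startswith("GRCh38"):
        print name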
Code Example #13
File: download_tcga.py  Project: firebitsbr/changlab
def format_firehose_mirna(filename, output):
    from genomicode import filelib

    matrix = [x for x in filelib.read_cols(filename)]
    HYB_REF = "Hybridization REF"
    GENE_ID = "miRNA_ID"
    assert matrix
    assert matrix[0][0] == HYB_REF
    assert matrix[1][0] == GENE_ID
    header0 = matrix[0]
    header1 = matrix[1]
    for i in range(1, len(header1), 3):
        assert header1[i] == "read_count"
        assert header1[i + 1] == "reads_per_million_miRNA_mapped"
        assert header1[i + 2] == "cross-mapped"

    sample_name = [header0[i] for i in range(2, len(header0), 3)]
    header = ["miRNA ID"] + sample_name
    f = open(output, 'w')
    f.write("\t".join(header) + '\n')
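    # Keep the reads_per_million_miRNA_mapped column (every third column,
    # starting at index 2) for each sample.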
    for i in range(2, len(matrix)):
        x = [matrix[i][j] for j in range(2, len(matrix[i]), 3)]
        x = [matrix[i][0]] + x
        assert len(x) == len(header)
        f.write("\t".join(x) + '\n')
    f.close()
Code Example #14
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import StringIO
        import arrayio
        from genomicode import arrayplatformlib
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        M = arrayio.read(in_data.identifier)
        metadata = {}

        # Add GENE_ID, GENE_SYMBOL, and DESCRIPTION.  Figure out which
        # platforms provide each of these.
        CATEGORIES = [
            arrayplatformlib.GENE_ID,
            arrayplatformlib.GENE_SYMBOL,
            # biomaRt doesn't convert description.  So just ignore it
            # for now.
            # TODO: implement DESCRIPTION.
            #arrayplatformlib.DESCRIPTION,
        ]

        #all_platforms = arrayplatformlib.identify_all_platforms_of_matrix(M)
        #assert all_platforms, "Unknown platform: %s" % in_data.identifier
        #header, platform_name = all_platforms[0]
        scores = arrayplatformlib.score_matrix(M)
        scores = [x for x in scores if x.max_score >= 0.75]
        assert scores, "I could not identify any platforms."

        # Find all the platforms not in the matrix.
        platforms = [
            arrayplatformlib.find_platform_by_name(x.platform_name)
            for x in scores
        ]
        categories = [x.category for x in platforms]
        missing = [x for x in CATEGORIES if x not in categories]

        score = scores[0]
        platform = platforms[0]
        to_add = []  # list of platform names
        for category in missing:
            x = arrayplatformlib.PLATFORMS
            x = [x for x in x if x.category == category]
            x = [x for x in x if x.bm_organism == platform.bm_organism]
            x = [x for x in x if x.name != score.platform_name]
            # Take the first one, if any.
            if x:
                to_add.append(x[0].name)

        if to_add:
            annotate = mlib.get_config("annotate_matrix",
                                       which_assert_file=True)
            sq = parallel.quote
            cmd = [
                "python",
                sq(annotate),
                "--no_na",
                "--header",
                sq(score.header),
            ]
            for x in to_add:
                x = ["--platform", sq(x)]
                cmd.extend(x)
            cmd.append(in_data.identifier)
            cmd = " ".join(cmd)
            data = parallel.sshell(cmd)
            metadata["commands"] = [cmd]
            assert data.find("Traceback") < 0, data
        else:
            data = open(in_data.identifier).read()

        # Clean up the headers.
        platform2pretty = {
            "Entrez_ID_human": "Gene ID",
            "Entrez_Symbol_human": "Gene Symbol",
            "Entrez_ID_mouse": "Gene ID",
            "Entrez_Symbol_mouse": "Gene Symbol",
        }
        handle = open(outfile, 'w')
        header_written = False
        for cols in filelib.read_cols(StringIO.StringIO(data)):
            if not header_written:
                cols = [platform2pretty.get(x, x) for x in cols]
                cols = AnnotationMatrix.uniquify_headers(cols)
                header_written = True
            print >> handle, "\t".join(cols)
        handle.close()

        return metadata
Code Example #15
def read_as_am(filename, is_csv=False):
    # Read file in SVM format.  Return an AnnotationMatrix object.
    # Does no special processing on any columns (i.e. no parsing as
    # integers or Call objects).  Everything is a string.

    # Header format:  <header0>___<header1>___<header2>
    # "blanks" are filled in.  E.g. "Annovar" occurs in each Annovar
    # column in header0.
    #
    # Headers:
    # ______Chrom
    # ______Pos
    # ______Ref
    # ______Alt
    # Num Callers______<Sample>
    # ...
    from genomicode import filelib
    from genomicode import AnnotationMatrix

    delimiter = "\t"
    if is_csv:
        delimiter = ","

    matrix = []
    for x in filelib.read_cols(filename, delimiter=delimiter):
        matrix.append(x)
    assert len(matrix) >= 3  # at least 3 rows for the header
    for i in range(1, len(matrix)):
        assert len(matrix[i]) == len(matrix[0])
    assert len(matrix[0]) >= 4  # Chrom, Pos, Ref, Alt
    assert len(matrix[0]) >= 5, "No calls"

    header0 = matrix[0]
    header1 = matrix[1]
    header2 = matrix[2]
    assert header2[:4] == ["Chrom", "Pos", "Ref", "Alt"]

    # Fill in the blanks for header1.
    for i in range(1, len(header1)):
        if header1[i]:
            continue
        # header1[i] is blank.  If header0[i] is set, then this starts a
        # new "block": leave header1[i] blank rather than copying the old
        # value over.
        if not header0[i]:
            header1[i] = header1[i - 1]
    # Fill in the blanks for header0.
    for i in range(1, len(header0)):
        if not header0[i]:
            header0[i] = header0[i - 1]

    # Make a list of all samples.
    I = [i for (i, x) in enumerate(header2) if x == "Ref/Alt/VAF"]
    assert I
    x = [header0[i] for i in I]
    x = [x for x in x if x]
    # Get rid of duplicates, preserving order.
    x = [x[i] for (i, y) in enumerate(x) if y not in x[:i]]
    samples = x

    # Make a list of all callers.
    x = [header1[i] for i in I]
    x = [x for x in x if x]
    # Get rid of duplicates, preserving order.
    x = [x[i] for (i, y) in enumerate(x) if y not in x[:i]]
    callers = x

    headers = []
    for x in zip(header0, header1, header2):
        x = "___".join(x)
        headers.append(x)
    all_annots = []
    for j in range(len(headers)):
        annots = [x[j] for x in matrix[3:]]
        all_annots.append(annots)
    matrix = AnnotationMatrix.create_from_annotations(headers, all_annots)
    matrix.samples = samples
    matrix.callers = callers
    return matrix
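
A minimal usage sketch ("calls.txt" is a hypothetical file in the SVM
format described above):

matrix = read_as_am("calls.txt")
print matrix.samples   # sample names, in order of first appearance
print matrix.callers   # caller names, in order of first appearance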
Code Example #16
def main():
    import os
    import argparse
    import itertools

    from genomicode import filelib
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("reference_genome", help="fasta file")

    parser.add_argument("-j",
                        dest="num_procs",
                        type=int,
                        default=1,
                        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Just display the commands, and don't generate the alignment.")
    parser.add_argument("--window",
                        default=80,
                        type=int,
                        help="Number of bases in alignment.  Default: 80")

    group = parser.add_argument_group(title="Input")
    group.add_argument("--bam_file", help="Indexed BAM file.")
    group.add_argument("--bam_path", help="Path to BAM files.")
    group.add_argument(
        "--position",
        action="append",
        default=[],
        help="Specify a position to view, "
        "e.g. chr20:45,927,663 or chr20:45927663.  1-based coordinates")
    group.add_argument("--position_file",
                       help="Tab-delimited text file with two columns.  "
                       "Column 1 is chromosome, column 2 is position.")

    group = parser.add_argument_group(title="Output")
    group.add_argument("--prefix", help="Pre-pend a prefix to each outfile.")
    group.add_argument(
        "--outpath",
        help="If multiple alignments are generated, this option "
        "directs where to save the output files.")
    group.add_argument(
        "--noclobber",
        action="store_true",
        help="If an output file already exists, don't overwrite it.")

    # Parse the input arguments.
    args = parser.parse_args()
    filelib.assert_exists_nz(args.reference_genome)
    assert args.bam_file or args.bam_path, \
           "Either --bam_file or --bam_path must be provided."
    assert not (args.bam_file and args.bam_path), \
           "Cannot specify both --bam_file and --bam_path."
    if args.bam_file:
        filelib.assert_exists_nz(args.bam_file)
    if args.bam_path:
        assert os.path.exists(args.bam_path)
    if args.position_file:
        filelib.assert_exists_nz(args.position_file)
    if args.outpath and not os.path.exists(args.outpath):
        os.mkdir(args.outpath)
    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    assert args.window >= 1 and args.window < 500

    bam_filenames = []
    if args.bam_file:
        bam_filenames.append(args.bam_file)
    else:
        x = os.listdir(args.bam_path)
        x = [x for x in x if x.endswith(".bam")]
        x = [os.path.join(args.bam_path, x) for x in x]
        bam_filenames = x
    assert bam_filenames, "No bam files found."

    positions = []  # list of (chrom, pos)
    for x in args.position:
        chrom, pos = _parse_position(x)
        positions.append((chrom, pos))
    if args.position_file and os.path.exists(args.position_file):
        for cols in filelib.read_cols(args.position_file):
            assert len(cols) == 2, "Position file should have 2 columns"
            chrom, pos = cols
            pos = int(pos)
            assert pos >= 1
            positions.append((chrom, pos))
    assert positions, "No positions specified."

    # Make the commands.
    assert hasattr(config, "samtools")
    filelib.assert_exists(config.samtools)

    # Make sure we have the right version of samtools.
    # 1.2 (using htslib 1.2.1)
    # 0.1.18 (r982:295)
    version = alignlib.get_samtools_version()
    x = version.split(".")
    assert len(x) >= 2
    major = x[0]
    assert major in ["0", "1"], "Unknown samtools version: %s" % version
    major = int(major)
    assert major >= 1, "Requires samtools >= 1 (Current version: %s)" % version

    commands = []
    for x in itertools.product(bam_filenames, positions):
        bam_filename, (chrom, pos) = x

        p, f = os.path.split(bam_filename)
        sample, e = os.path.splitext(f)

        left = max(pos - args.window / 2, 1)
        pos_str = "%s:%s" % (chrom, left)

        x = "%2s.%9s.%s.html" % (chrom, pos, sample)
        if args.prefix:
            x = "%s.%s" % (args.prefix, x)
        if args.outpath:
            x = os.path.join(args.outpath, x)
        out_filename = x

        if args.noclobber and os.path.exists(out_filename):
            continue

        # samtools tview -d t -p 7:100550778 bam01/196B-lung.bam $FA
        sq = parallel.quote
        x = [
            sq(config.samtools),
            "tview",
            "-d",
            "h",
            "-p",
            pos_str,
            sq(bam_filename),
            sq(args.reference_genome),
        ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(out_filename))
        commands.append(x)

    if args.dry_run:
        for x in commands:
            print x
        return

    parallel.pshell(commands, max_procs=args.num_procs)
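
A hypothetical invocation (the script and file names are placeholders
for however this main() is packaged):

# python tview_alignments.py -j 4 --bam_path bam/ \
#     --position chr20:45,927,663 --position_file positions.txt \
#     --outpath alignments/ hg19.fa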
Code Example #17
File: download_tcga.py  Project: firebitsbr/changlab
def merge_rppa_files(in_files, out_file):
    import shutil
    from genomicode import filelib

    assert len(in_files) == 2
    x1 = [x for x in in_files if x.endswith(".antibody_annotation.txt")]
    x2 = [x for x in in_files if x.endswith(".rppa.txt")]
    assert len(x1) == 1
    assert len(x2) == 1
    annotation_file = x1[0]
    data_file = x2[0]

    # Actually, just return the data_file.  It contains all the
    # information we need.
    shutil.copy2(data_file, out_file)
    return
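    # NOTE: Everything below this point is unreachable; it is kept as
    # documentation of the two input formats and of the alignment checks
    # that a real merge would need.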

    # OV.antibody_annotation.txt
    # Gene Name  Composite Element REF
    # YWHAB      14-3-3_beta
    # YWHAE      14-3-3_epsilon
    # YWHAZ      14-3-3_zeta
    # EIF4EBP1   4E-BP1
    # EIF4EBP1   4E-BP1_pS65

    # OV.rppa.txt
    # Composite.Element.REF     TCGA-04-1335-01A-21-1561-20
    # YWHAB|14-3-3_beta         -0.00855276625000018
    # YWHAE|14-3-3_epsilon      0.05985423025
    # YWHAZ|14-3-3_zeta         -0.04074335825
    # EIF4EBP1|4E-BP1           -0.62276845725
    # EIF4EBP1|4E-BP1_pS65      0.00776960074999994
    # EIF4EBP1|4E-BP1_pT37_T46  -0.04959447325

    # Make sure these files are aligned properly.
    M1 = [x for x in filelib.read_cols(annotation_file)]
    M2 = [x for x in filelib.read_cols(data_file)]
    assert M1 and M2
    assert M1[0][0] == "Gene Name"
    assert M1[0][1] == "Composite Element REF"
    assert M2[0][0] == "Composite.Element.REF"
    assert len(M1) == len(M2)

    # Make sure the header names don't conflict.
    M1[0][1] = "Antibody"

    for i in range(1, len(M1)):
        name1 = M1[i][0]
        x = M2[i][0]
        x = x.split("|")
        assert len(x) == 2
        name2, antibody = x
        assert name1 == name2

    M = []
    for i in range(len(M1)):
        x = M1[i] + M2[i]
        M.append(x)

    handle = open(out_file, 'w')
    for x in M:
        print >> handle, "\t".join(x)
Code Example #18
def extract_signal(filename, outhandle):
    import os
    import tempfile
    from genomicode import filelib
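    # (remove_quotes and FileMatrix are helpers defined elsewhere in this
    # module.)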

    # Write stuff to file to handle large data sets.
    tmpfile1 = tmpfile2 = tmpfile3 = None
    try:
        # tmpfile1        Raw signal data from series matrix file.
        # tmpfile2.<num>  Raw data split into separate tables.
        # tmpfile3        Final merged signal table.
        x, tmpfile1 = tempfile.mkstemp(dir=".")
        os.close(x)
        x, tmpfile2 = tempfile.mkstemp(dir=".")
        os.close(x)
        x, tmpfile3 = tempfile.mkstemp(dir=".")
        os.close(x)

        # Get a list of all lines in the series matrix tables.
        handle = open(tmpfile1, 'w')
        in_matrix_table = 0
        for cols in filelib.read_cols(filename):
            # Some files can have blank lines.
            if not cols:
                continue
            if cols[0] == "!series_matrix_table_begin":
                in_matrix_table = 1
            elif cols[0] == "!series_matrix_table_end":
                in_matrix_table = 0
            elif in_matrix_table:
                cols = [remove_quotes(x).strip() for x in cols]
                print >> handle, "\t".join(cols)
        handle.close()
        handle = None

        # Split the data into separate tables.
        num_tables = 0
        for line in filelib.openfh(tmpfile1):
            if line.startswith("ID_REF"):
                handle = open("%s.%d" % (tmpfile2, num_tables), 'w')
                num_tables += 1
            assert handle
            print >> handle, line,
        if handle:
            handle.close()
        assert num_tables

        # Sometimes the tables will not be aligned.
        # E.g. GSE9899-GPL570 contains two tables, and the 2nd is
        # missing some probe sets.  Get a list of the probe sets in
        # the tables.
        files = ["%s.%d" % (tmpfile2, i) for i in range(num_tables)]
        matrices = [FileMatrix(x) for x in files]
        id2indexes = []
        for matrix in matrices:
            id2index = {}
            for i, row in enumerate(matrix):
                id_ = row[0]
                id2index[id_] = i
            id2indexes.append(id2index)

        # Make a list of all the IDs.
        all_ids = {}
        for id2index in id2indexes:
            for id_ in id2index:
                all_ids[id_] = 1
        del all_ids["ID_REF"]
        all_ids = all_ids.keys()
        all_ids.sort()
        all_ids = ["ID_REF"] + all_ids

        # Align the indexes.
        #num_rows = row_names = None
        #for i in range(num_tables):
        #    filename = "%s.%d" % (tmpfile2, i)
        #    rname, nrow = [], 0
        #    for line in openfh(filename):
        #        x = line.split("\t", 1)[0]
        #        rname.append(x)
        #        nrow += 1
        #    if num_rows is None:
        #        num_rows = nrow
        #    if row_names is None:
        #        row_names = rname
        #    assert num_rows == nrow, "table is unaligned"
        #    assert row_names == rname

        # Merge all the pieces together into one big table.
        handle = open(tmpfile3, 'w')
        for id_ in all_ids:
            cols = []
            for matrix, id2index in zip(matrices, id2indexes):
                if id_ in id2index:
                    x = matrix[id2index[id_]]
                else:
                    # If this ID is missing, then just insert blank values.
                    x = [""] * len(matrix[0])
                if cols:
                    # If this is not the first matrix, then delete the
                    # row names.
                    x = x[1:]
                cols.extend(x)
            print >> handle, "\t".join(cols)
        handle.close()

        num_rows = len(all_ids)
        num_cols = len(filelib.read_cols(tmpfile3).next())

        # Figure out which expression values are missing.
        data_missing = {}
        for i, cols in enumerate(filelib.read_cols(tmpfile3)):
            assert len(cols) == num_cols, "line %d unaligned [%d:%d]" % (
                i, len(cols), num_cols)
            if i == 0:
                continue
            for j in range(1, len(cols)):
                try:
                    float(cols[j])
                except ValueError:
                    data_missing[(i, j)] = 1
                if cols[j] == "nan":
                    data_missing[(i, j)] = 1

        ## Remove the samples where >50% values are missing.
        #col_missing = [0] * num_cols   # number of values missing in each col
        #for i, j in data_missing:
        #    col_missing[j] += 1

        good_cols = [0]
        for i in range(1, num_cols):
            #if col_missing[i] > 0.50*(num_rows-1):  # -1 for the row names
            #    continue
            good_cols.append(i)

        ## Remove the genes where any value is missing.
        #row_missing = [0] * num_rows
        #for i, j in data_missing:
        #    if j not in good_cols:   # ignore samples that are already dropped
        #        continue
        #    row_missing[i] += 1

        good_rows = [0]
        for i in range(1, num_rows):
            #if row_missing[i] > 0:  # a value is missing.
            #    continue
            good_rows.append(i)

        assert len(good_cols) > 1, "no data"
        assert len(good_rows) > 1, "no data"

        # Write out the data.
        for i, cols in enumerate(filelib.read_cols(tmpfile3)):
            if i not in good_rows:
                continue
            x = [x for (i, x) in enumerate(cols) if i in good_cols]
            print >> outhandle, "\t".join(x)
    finally:
        # Clean up the temporary files.  (Reconstructed: the "try" above
        # has no matching "except" or "finally" in this excerpt, which is
        # not valid Python; the original module presumably removes its
        # tmpfiles here.)
        import glob
        for x in [tmpfile1, tmpfile2, tmpfile3]:
            if x and os.path.exists(x):
                os.unlink(x)
        if tmpfile2:
            for x in glob.glob("%s.*" % tmpfile2):
                os.unlink(x)
Code Example #19
def read(filename, is_csv=False):
    # Everything is a string.  No numeric conversion.
    from genomicode import filelib
    #from genomicode import AnnotationMatrix
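    # (_parse_call and make_matrix are helpers defined elsewhere in this
    # module.)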

    delimiter = "\t"
    if is_csv:
        delimiter = ","

    matrix = []
    for x in filelib.read_cols(filename, delimiter=delimiter):
        matrix.append(x)
        #if len(matrix) > 50000:  # DEBUG
        #    break
    assert len(matrix) >= 3  # at least 3 rows for the header
    for i in range(1, len(matrix)):
        assert len(matrix[i]) == len(matrix[0])
    assert len(matrix[0]) >= 4  # Chrom, Pos, Ref, Alt
    assert len(matrix[0]) >= 5, "No calls"

    header0 = matrix[0]
    header1 = matrix[1]
    header2 = matrix[2]
    #assert header0[0] == "Sample"
    #assert header1[0] == "Caller"
    assert header2[:4] == ["Chrom", "Pos", "Ref", "Alt"]

    # Make a list of all samples.
    I = [i for (i, x) in enumerate(header2) if x == "Ref/Alt/VAF"]
    assert I
    x = [header0[i] for i in I]
    #x = header0[1:]
    x = [x for x in x if x]
    # Get rid of duplicates, preserving order.
    x = [x[i] for (i, y) in enumerate(x) if y not in x[:i]]
    samples = x

    # Make a list of all callers.
    x = [header1[i] for i in I]
    #x = header1[1:]
    x = [x for x in x if x]
    # Get rid of duplicates, preserving order.
    x = [x[i] for (i, y) in enumerate(x) if y not in x[:i]]
    callers = x

    # Figure out where the annotations end.
    for i in range(1, len(header0)):
        if header0[i]:
            break
    else:
        raise AssertionError, "No calls"
    annot_end = i

    # Make the annotation matrix.
    annot_header = header2[:annot_end]
    annot_data = [x[:annot_end] for x in matrix[3:]]

    # Find the start coordinates of the named matrices.
    x = [i for (i, x) in enumerate(header0) if x]
    x = [i for i in x if i not in I]
    I_named = x  # list of start index of the named matrices.
    I_coord = []  # list of (start, end) of named matrices.
    for i in range(len(I_named)):
        i_start = I_named[i]
        if i + 1 < len(I_named):
            i_end = I_named[i + 1]
        else:
            i_end = I[0]
        I_coord.append((i_start, i_end))
    # Make the named matrices.
    named_data = []  # list of (name, named_header, named_annots)
    for (i_start, i_end) in I_coord:
        name = header0[i_start]
        assert name
        named_header = header2[i_start:i_end]
        M = [x[i_start:i_end] for x in matrix[3:]]
        named_annots = []
        for j in range(len(named_header)):
            x = [M[i][j] for i in range(len(M))]
            named_annots.append(x)
        x = name, named_header, named_annots
        named_data.append(x)

    # Make the call_data.
    call_data = []
    header_samples = [None] * len(header0)
    for i in I:
        if header0[i]:
            header_samples[i] = header0[i]
        else:
            header_samples[i] = header_samples[i - 1]
        assert header_samples[i]
    header_callers = [None] * len(header1)
    for i in I:
        header_callers[i] = header1[i]
        assert header_callers[i]
    for i in range(3, len(matrix)):
        chrom, pos, ref, alt = matrix[i][:4]
        pos = int(pos)
        for j in I:
            sample, caller = header_samples[j], header_callers[j]
            if not matrix[i][j]:
                continue
            call = _parse_call(matrix[i][j])
            x = chrom, pos, ref, alt, sample, caller, call
            call_data.append(x)

    return make_matrix(samples, callers, annot_header, annot_data, named_data,
                       call_data)
Code Example #20
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import config
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import filelib
        from Betsy import module_utils

        bam_node, ref_node, pos_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Positions file has 0-based coordinates (like BAM files).
        # But samtools requires 1-based coordinates.  Convert to
        # 1-based coordinates.
        positions_filename = "positions.txt"
        outhandle = open(positions_filename, 'w')
        for x in filelib.read_cols(pos_node.identifier):
            assert len(x) == 2
            chrom, pos = x
            pos = int(pos) + 1  # convert from 0- to 1-based coords.
            x = chrom, pos
            print >> outhandle, "\t".join(map(str, x))
        outhandle.close()

        # list of (in_filename, err_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            sample, ext = os.path.splitext(f)
            err_filename = os.path.join(out_path, "%s.log" % sample)
            out_filename = os.path.join(out_path, "%s.pileup" % sample)
            x = filelib.GenericObject(in_filename=in_filename,
                                      err_filename=err_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        ## Get possible positions file.
        #positions_filename = module_utils.get_user_option(
        #    user_options, "positions_file", check_file=True)

        # Figure out whether the purpose is to get coverage.  Change
        # the parameters if it is.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["all", "snp", "indel", "consensus"]
        #if cov == "yes":
        #    assert positions_filename, "Missing: positions_file"

        # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \
        #   $i > $j"
        samtools = filelib.which_assert(config.samtools)

        # Get an error if the BAM files are not indexed.
        # [W::bam_hdr_read] EOF marker is absent. The input is probably
        #   truncated.

        #if vartype == "consensus":
        #    args = [
        #        "-R",        # Ignore read group tags.
        #        "-B",        # Disable BAQ (base quality) computation.
        #        "-q", 0,     # Skip bases with mapQ smaller than this.
        #        "-Q", 0,     # Skip bases with BAQ smaller than this.
        #        "-d10000000",  # Allow deep reads.
        #        ]
        #else:
        #    raise NotImplementedError
        args = [
            "-R",  # Ignore read group tags.
            "-B",  # Disable BAQ (base quality) computation.
            "-q",
            0,  # Skip bases with mapQ smaller than this.
            "-Q",
            0,  # Skip bases with BAQ smaller than this.
            "-d10000000",  # Allow deep reads.
        ]

        sq = parallel.quote
        commands = []
        for j in jobs:
            x = [
                sq(samtools),
                "mpileup",
                "-f",
                sq(ref.fasta_file_full),
            ]
            if positions_filename:
                x.extend(["-l", positions_filename])
            x.extend(args)
            x.append(sq(j.in_filename))
            x = " ".join(map(str, x))
            x = "%s 2> %s 1> %s" % (x, j.err_filename, j.out_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] = commands

        # File may be empty if there are no reads.
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_many(x)

        # Make sure there are no errors in the log files.
        # (check_log_file is a helper defined elsewhere in this module.)
        for j in jobs:
            check_log_file(j.err_filename)

        return metadata