def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Plot a sample PCA with the samples colored by predicted class.

        antecedents is unpacked as (data_node, classify_node).  The file
        at classify_node.identifier is read with genesetlib.read_tdf,
        and the values of its 'Predicted_class' entry are used as the
        legend (one label per sample).  Each distinct label is given a
        color, and data_node.identifier, outfile, the per-sample colors
        and the legend are handed to plot_sample_pca.plot_pca.

        network, out_attributes, user_options and num_cores are not
        used by this method.

        Raises AssertionError if the classification file has no
        'Predicted_class' entry.
        """
        from genomicode import genesetlib
        import plot_sample_pca

        data_node, classify_node = antecedents

        # If several entries are named 'Predicted_class', the last one
        # wins.
        legend = None
        for row in genesetlib.read_tdf(classify_node.identifier,
                                       preserve_spaces=True,
                                       allow_duplicates=True):
            if row[0] == 'Predicted_class':
                legend = row[2]
        assert legend is not None, \
            "I could not find Predicted_class in file: %s" % \
            classify_node.identifier

        # Assign colors in order of first appearance so the mapping does
        # not depend on dictionary ordering.  The palette is reused
        # cyclically if there are more classes than colors.
        colors = ['r', 'b', 'g', 'y']
        class2color = {}
        for item in legend:
            if item not in class2color:
                class2color[item] = colors[len(class2color) % len(colors)]
        color = [class2color[item] for item in legend]

        plot_sample_pca.plot_pca(data_node.identifier, outfile, color, legend)
Ejemplo n.º 2
0
def read_annotation_descriptor(annotation_descriptor, annotations):
    """Read a descriptor file and align its columns to annotations.

    annotation_descriptor is the name of a file readable by
    genesetlib.read_tdf.  The column that shares the most values with
    annotations is taken as the key column; it must cover at least half
    of annotations.  Every column is then reordered, using the indexes
    returned by jmath.match(annotations, key_column), so that it lines
    up with annotations.  Positions for which jmath.match gives None
    are filled with "".

    Returns a list of (header, annots), in file order, where each
    annots has one value per element of annotations.

    Raises AssertionError if the file does not exist, is empty, or no
    column matches enough of annotations.
    """
    from genomicode import jmath
    from genomicode import genesetlib

    filename = annotation_descriptor
    assert os.path.exists(filename), "I could not find file: %s" % filename

    header2annots = []  # list of (header, annots)
    for x in genesetlib.read_tdf(
        filename, preserve_spaces=True, allow_duplicates=True):
        name, description, annots = x
        header2annots.append((name, annots))
    assert header2annots, "No annots."

    # Find the column that contains the annotations.  Build the set
    # once rather than once per column.
    annotation_set = set(annotations)
    header2counts = {}
    for (header, annots) in header2annots:
        # With duplicated headers, the last column of that name wins,
        # which matches how annot_annots is chosen below.
        header2counts[header] = len(annotation_set.intersection(annots))
    # Walk the headers in file order so that ties are always resolved
    # in favor of the earliest column, regardless of dict ordering.
    best_header = best_count = None
    for (header, _annots) in header2annots:
        count = header2counts[header]
        if best_count is None or count > best_count:
            best_header, best_count = header, count
    assert best_count >= len(annotations)/2.0, \
                  "I could not find the annotations in the descriptor file."
    annot_header = best_header

    # Align the annotation matrix to the annotations.
    annot_annots = None
    for (header, annots) in header2annots:
        if header == annot_header:
            annot_annots = annots
    assert annot_annots
    I = jmath.match(annotations, annot_annots)

    header2annots_aligned = []
    for header, annots in header2annots:
        # None means no matching row in the descriptor; leave it blank.
        annots_aligned = ["" if i is None else annots[i] for i in I]
        header2annots_aligned.append((header, annots_aligned))

    return header2annots_aligned
Ejemplo n.º 3
0
def read(filename, is_csv=False, header_char=None, nrows=None):
    """Read a delimited annotation file into an AnnotationMatrix.

    All values are kept as strings; no numeric conversion is done.

    filename     File to read with genesetlib.read_tdf.
    is_csv       If true, fields are split on "," instead of tab.
    header_char  Passed to read_tdf as yield_lines_startswith.  Any
                 plain strings that read_tdf yields are collected and
                 handed to the matrix as headerlines.
    nrows        Passed through to read_tdf.

    Raises AssertionError if no columns are read.
    """
    import re
    from genomicode import genesetlib

    delimiter = "," if is_csv else "\t"

    # re.sub was measured at 25% of the running time, so the pattern is
    # compiled once up front.  It is only needed by the disabled
    # cleanup step below.
    re_naive = re.compile("na\\W+ve")

    all_comments = []
    all_headers = []
    all_annots = []
    records = genesetlib.read_tdf(
        filename,
        preserve_spaces=True,
        allow_duplicates=True,
        delimiter=delimiter,
        yield_lines_startswith=header_char,
        nrows=nrows)
    for record in records:
        # Plain strings are raw lines rather than (name, description,
        # annots) columns.
        if type(record) is type(""):
            all_comments.append(record)
            continue
        name, description, annots = record

        # Hack: Some files contain special characters that break
        # alignment, e.g.
        #   na\xc3\xafve-WIBR3.5 hESC
        #   na\xe2\x80\x9a\xc3\xa0\xc3\xb6\xe2\x88\x9a\xc3\xb2ve-C1.2 hiPSC
        # Normalizing them to "naive" is slow, so it stays switched off
        # unless needed.
        if False:
            annots = [re_naive.sub("naive", annot) for annot in annots]

        all_headers.append(name)
        all_annots.append(annots)
    assert all_headers, "Empty file: %s" % filename

    headers_h = uniquify_headers(all_headers)
    header2annots = dict(zip(headers_h, all_annots))
    return AnnotationMatrix(
        all_headers, headers_h, header2annots, headerlines=all_comments)
Ejemplo n.º 4
0
def read_gene_descriptor(gene_descriptor, geneset):
    """Read pretty names for the genes in geneset.

    gene_descriptor is in the format of <filename>,<header>.  It is
    split on the first comma only, so the header itself may contain
    commas.  The file is read with genesetlib.read_tdf.  The column
    sharing the most values with geneset is taken as the gene column
    (it must cover at least half of geneset, since some genes may not
    be annotated), and <header> names the column of pretty names.

    Returns a dictionary of gene -> pretty name.  Rows where either the
    gene or the pretty name is empty are skipped.

    Raises AssertionError if the descriptor is malformed, the file is
    missing or empty, the genes or <header> cannot be found, or the two
    columns have different lengths.
    """
    from genomicode import genesetlib

    x = gene_descriptor.split(",", 1)
    assert len(x) == 2, \
        "gene_descriptor should be <filename>,<header>: %s" % gene_descriptor
    filename, pretty_header = x
    assert os.path.exists(filename), "I could not find file: %s" % filename

    header_order = []  # headers in file order, without duplicates
    header2genes = {}
    for x in genesetlib.read_tdf(
        filename, preserve_spaces=True, allow_duplicates=True):
        name, description, genes = x
        if name not in header2genes:
            header_order.append(name)
        # With duplicated headers, the last column of that name wins.
        header2genes[name] = genes
    assert header2genes, "No genes."

    # Find the column that contains the genes in the gene set.  Since
    # some of the genes may not be annotated, provide some leeway
    # here.  Ties go to the earliest column in the file.
    geneset_set = set(geneset)
    best_header = best_count = None
    for header in header_order:
        count = len(geneset_set.intersection(header2genes[header]))
        if best_count is None or count > best_count:
            best_header, best_count = header, count
    assert best_count >= len(geneset)/2.0, \
                  "I could not find the genes in the descriptor file."

    gene_header = best_header
    assert pretty_header in header2genes, \
        "I could not find header: %s" % pretty_header
    genes = header2genes[gene_header]
    pretty = header2genes[pretty_header]
    assert len(genes) == len(pretty)

    gene2pretty = {}
    for g, p in zip(genes, pretty):
        if not g or not p:
            continue
        gene2pretty[g] = p

    return gene2pretty
Ejemplo n.º 5
0
def read_infile(filename):
    """Read filename with genesetlib.read_tdf into an AnnotationMatrix.

    Every column must have the same number of annotations as the first
    one, otherwise an AssertionError is raised.  The matrix is built
    from a name -> annots dictionary and the list of names in file
    order.
    """
    from genomicode import genesetlib

    columns = genesetlib.read_tdf(
        filename, preserve_spaces=True, allow_duplicates=True)

    name2annots = {}
    name_order = []
    expected_len = None  # set from the first column
    for name, _description, annots in columns:
        if expected_len is None:
            expected_len = len(annots)
        assert len(annots) == expected_len

        name2annots[name] = annots
        name_order.append(name)

    return AnnotationMatrix(name2annots, name_order)
Ejemplo n.º 6
0
def read_clinical_annotations(M, filename):
    """Read clinical annotations from filename and align them with M.

    The file is read with genesetlib.read_tdf into a dictionary of
    name -> list of values.  If a name occurs more than once, the last
    occurrence is kept.  The dictionary and M are then passed through
    align_matrix_with_clinical_data.

    Returns a tuple of (Matrix, clinical annotations) as produced by
    the alignment.
    """
    from genomicode import genesetlib

    columns = genesetlib.read_tdf(
        filename, preserve_spaces=True, allow_duplicates=True)
    clinical_annots = dict(
        (name, values) for (name, _description, values) in columns)

    # Align the matrix with the clinical annotations.
    aligned_M, aligned_annots = align_matrix_with_clinical_data(
        M, clinical_annots)
    return aligned_M, aligned_annots