def parseGeo(filename):
    """Open a GEO SOFT file and return the records it contains.

    The file may be plain text or gzip-compressed. The format is detected
    from the two-byte gzip magic number instead of by trial and error.

    Args:
        filename: path to the SOFT file.

    Returns:
        The result of ``Geo.parse`` on the opened handle. The handle is left
        open on purpose: ``Geo.parse`` reads from it lazily, while the caller
        iterates over the records.

    Exits the process with status 3, after printing a message, when the file
    cannot be opened.
    """
    try:
        # gzip.open() does not validate the header until the first read, so a
        # try/except around it never notices a plain-text file. Inspect the
        # magic bytes up front instead.
        with open(filename, "rb") as probe:
            is_gzipped = probe.read(2) == b"\x1f\x8b"
        fh = gzip.open(filename) if is_gzipped else open(filename)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)
    return Geo.parse(fh)
def parseGeo(filename):
    """Open a GEO SOFT file and return the records it contains.

    Handles both plain-text and gzip-compressed files. Compression is
    detected by reading the gzip magic number from the start of the file.

    Args:
        filename: path to the SOFT file.

    Returns:
        The result of ``Geo.parse`` on the opened handle. The handle stays
        open because ``Geo.parse`` pulls data from it lazily during
        iteration.

    Exits the process with status 3, after printing a message, when the file
    cannot be opened.
    """
    gzip_magic = b"\x1f\x8b"
    try:
        with open(filename, "rb") as probe:
            header = probe.read(len(gzip_magic))
        # Choosing the opener from the header avoids relying on gzip.open()
        # to fail: it only raises once data is actually read.
        if header == gzip_magic:
            fh = gzip.open(filename)
        else:
            fh = open(filename)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)
    return Geo.parse(fh)
def parseGeo(filename):
    """Open a GEO SOFT file and return the records it contains.

    Accepts plain-text or gzip-compressed input. The gzip magic number at
    the start of the file decides which opener is used.

    Args:
        filename: path to the SOFT file.

    Returns:
        The result of ``Geo.parse`` on the opened handle. The handle is not
        closed here, since ``Geo.parse`` reads it lazily as the caller
        iterates.

    Exits the process with status 3, after printing a message, when the file
    cannot be opened.
    """
    try:
        with open(filename, "rb") as probe:
            # A gzip stream always starts with the bytes 0x1f 0x8b. Checking
            # them here is needed because gzip.open() itself raises only on
            # the first read, which happens after this function returns.
            opener = gzip.open if probe.read(2) == b"\x1f\x8b" else open
        fh = opener(filename)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)
    return Geo.parse(fh)
def parseGeo(filename):
    """Print every record of a GEO SOFT file.

    The file may be plain text or gzip-compressed. The format is detected
    from the gzip magic number, so a file is parsed exactly once and never
    re-read from the start after partial output.

    Args:
        filename: path to the SOFT file.

    Exits the process with status 3, after printing a message, on an I/O
    error while opening or reading the file. Other exceptions, such as
    parse errors, propagate to the caller instead of being swallowed.
    """
    try:
        with open(filename, "rb") as probe:
            opener = gzip.open if probe.read(2) == b"\x1f\x8b" else open
        with opener(filename) as fh:
            for record in Geo.parse(fh):
                print(record)
    except (IOError, OSError):
        print("Could not open %s" % filename)
        sys.exit(3)
def read_gene_expr_data(geneset_dict):
    """
    Read gene expression data from the file named by GENE_EXPR_FILENAME.

    Only two of the records yielded by Geo.parse are used, selected by
    position: the 3rd supplies the sample ids of the normal subjects and the
    7th supplies the expression table.

    Parameters
        geneset_dict - its keys are the candidate genesets; it is also passed
                       to get_geneset_from_dict to map a gene id to a geneset

    Returns
        X_genesets - list of matrices of unnormalized gene expression data,
                     one per included geneset; rows are the GSM sample
                     columns of the table, columns are the geneset's genes
        y - column matrix of CONTROL_LABEL / DISEASE_LABEL, one row per GSM
            sample column
        genesets_included - geneset keys that matched at least one gene, in
                            the same order as X_genesets

    Raises
        AssertionError if the 3rd record's subset is not described as "normal"
    """
    # geneset key -> row indices (into X) of the genes assigned to it
    X_groups = {key: set() for key in geneset_dict}
    # gsm ids of the normal subjects; a set keeps the per-column lookup cheap
    normal_subjects = set()
    X = []
    y = []

    with open(GENE_EXPR_FILENAME) as handle:
        # The record counter has its own name so that no inner loop or
        # comprehension variable can overwrite it.
        for record_num, record in enumerate(Geo.parse(handle), 1):
            if record_num == 3:
                # Read patient labels so we can make the y vector
                attr = record.entity_attributes
                assert attr["subset_description"] == "normal"
                normal_subjects = set(attr["subset_sample_id"].split(","))
            elif record_num == 7:
                # Read actual gene expression data
                col_names = record.table_rows[0]
                gsm_idxs = []
                for idx, col_name in enumerate(col_names):
                    if col_name.startswith("GSM"):
                        gsm_idxs.append(idx)
                        # populate the y vector, one label per sample column
                        if col_name in normal_subjects:
                            y.append(CONTROL_LABEL)
                        else:
                            y.append(DISEASE_LABEL)

                geneid_idx = col_names.index("Gene ID")
                feature_idx = 0
                for row in record.table_rows[1:]:
                    geneid = row[geneid_idx]
                    geneset = get_geneset_from_dict(geneset_dict, geneid)
                    if geneset is not None:
                        # feature_idx is the row this gene is about to
                        # occupy in X
                        X_groups[geneset].add(feature_idx)
                        X.append([float(row[col]) for col in gsm_idxs])
                        feature_idx += 1

    # Make feature groups: transpose so that samples are rows, genes columns
    X = np.matrix(X).T
    X_genesets = []
    genesets_included = []
    for geneset_key, geneset_col_idxs in X_groups.items():
        if not geneset_col_idxs:
            continue
        X_genesets.append(X[:, list(geneset_col_idxs)])
        genesets_included.append(geneset_key)

    y = np.matrix(y).T
    return X_genesets, y, genesets_included
from Bio import Geo

# Print every record of the GDS3292 SOFT file. The ``with`` block closes the
# handle even if parsing or printing raises part-way through.
with open('./GDS3292/GDS3292_full.soft') as handle:
    for record in Geo.parse(handle):
        print(record)
def read_gene_expr_data(geneset_dict):
    """
    Read gene expression data from the file named by GENE_EXPR_FILENAME.

    Only two of the records yielded by Geo.parse are used, selected by
    position: the 3rd supplies the sample ids of the normal subjects and the
    7th supplies the expression table.

    Parameters
        geneset_dict - its keys are the candidate genesets; it is also passed
                       to get_geneset_from_dict to map a gene id to a geneset

    Returns
        X_genesets - list of matrices of unnormalized gene expression data,
                     one per included geneset; rows are the GSM sample
                     columns of the table, columns are the geneset's genes
        y - column matrix of CONTROL_LABEL / DISEASE_LABEL, one row per GSM
            sample column
        genesets_included - geneset keys that matched at least one gene, in
                            the same order as X_genesets

    Raises
        AssertionError if the 3rd record's subset is not described as "normal"
    """
    # geneset key -> row indices (into X) of the genes assigned to it
    X_groups = {key: set() for key in geneset_dict}
    # gsm ids of the normal subjects; a set keeps the per-column lookup cheap
    normal_subjects = set()
    X = []
    y = []

    with open(GENE_EXPR_FILENAME) as handle:
        # The record counter has its own name so that no inner loop or
        # comprehension variable can overwrite it.
        for record_num, record in enumerate(Geo.parse(handle), 1):
            if record_num == 3:
                # Read patient labels so we can make the y vector
                attr = record.entity_attributes
                assert attr["subset_description"] == "normal"
                normal_subjects = set(attr["subset_sample_id"].split(","))
            elif record_num == 7:
                # Read actual gene expression data
                col_names = record.table_rows[0]
                gsm_idxs = []
                for idx, col_name in enumerate(col_names):
                    if col_name.startswith("GSM"):
                        gsm_idxs.append(idx)
                        # populate the y vector, one label per sample column
                        if col_name in normal_subjects:
                            y.append(CONTROL_LABEL)
                        else:
                            y.append(DISEASE_LABEL)

                geneid_idx = col_names.index("Gene ID")
                feature_idx = 0
                for row in record.table_rows[1:]:
                    geneid = row[geneid_idx]
                    geneset = get_geneset_from_dict(geneset_dict, geneid)
                    if geneset is not None:
                        # feature_idx is the row this gene is about to
                        # occupy in X
                        X_groups[geneset].add(feature_idx)
                        X.append([float(row[col]) for col in gsm_idxs])
                        feature_idx += 1

    # Make feature groups: transpose so that samples are rows, genes columns
    X = np.matrix(X).T
    X_genesets = []
    genesets_included = []
    for geneset_key, geneset_col_idxs in X_groups.items():
        if not geneset_col_idxs:
            continue
        X_genesets.append(X[:, list(geneset_col_idxs)])
        genesets_included.append(geneset_key)

    y = np.matrix(y).T
    return X_genesets, y, genesets_included