def merge(args):
    filename = args.sample_set_file
    sep = args.sep
    name = args.name
    desc = args.desc
    sample_set_names = args.sample_set_names
    # check arguments
    if name is None:
        name = 'merge'
    if desc is None:
        desc = name
    if not sample_set_names:
        logging.error('No sample set names specified')
        return 1
    sample_sets = dict((ss.name, ss) for ss in
                       _parse_sample_sets(filename, sep))
    # union the named sets; the first set listed wins when a sample
    # appears in more than one set (a plain dict suffices here because
    # keys are checked before insertion)
    new_value_dict = {}
    for ss_name in sample_set_names:
        if ss_name not in sample_sets:
            logging.error('Sample set name "%s" not found. Exiting.' % (ss_name))
            return 1
        ss = sample_sets[ss_name]
        for k, v in ss.value_dict.iteritems():
            if k in new_value_dict:
                continue
            new_value_dict[k] = v
    ss = SampleSet(name, desc, new_value_dict.items())
    print ss.to_json()
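# Worked example of the merge semantics above (hypothetical data): the
# "first set wins" rule means values from earlier-listed sets are kept.
#   set 'tumor':  {'S1': 1, 'S2': 0}
#   set 'normal': {'S2': 1, 'S3': 1}
#   merged:       {'S1': 1, 'S2': 0, 'S3': 1}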
def _parse_sample_sets(filename, sep):
    if not os.path.exists(filename):
        logging.error("Sample set file '%s' not found" % (filename))
        sys.exit(1)
    # dispatch on file extension
    ext = os.path.splitext(filename)[-1]
    if ext == '.smx':
        for ss in SampleSet.parse_smx(filename, sep):
            yield ss
    elif ext == '.smt':
        for ss in SampleSet.parse_smt(filename, sep):
            yield ss
    elif ext == '.json':
        for ss in SampleSet.parse_json(filename):
            yield ss
    else:
        logging.error('suffix "%s" not recognized (expected .smx, .smt, or .json)' % (ext))
        sys.exit(1)
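# Usage sketch (hypothetical file name, shown for illustration only):
# _parse_sample_sets is a generator, so sample sets can be streamed
# without loading the whole file into memory.
def _example_list_sample_set_names():
    for ss in _parse_sample_sets('sample_sets.smt', '\t'):
        print ss.name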
def db_ss_printJSON(ssea_dir, matrix_dir, ss_id):
    sample_sets_json_file = os.path.join(ssea_dir, 'sample_set.json')
    bm = BigCountMatrix.open(matrix_dir)
    samples = bm.colnames
    ss = SampleSet.parse_json(sample_sets_json_file)[0]
    membership = ss.get_array(samples)
    d = ss.to_dict(membership)
    d['_id'] = int(ss_id)
    print json.dumps(d)
def newcohort(args):
    sample_set_file = args.sample_set_file
    cohort_file = args.cohort_file
    sep = args.sep
    cohort_samples = set(line.strip() for line in open(cohort_file))
    for ss in _parse_sample_sets(sample_set_file, sep):
        # restrict each sample set to samples present in the cohort
        new_value_dict = {}
        hits = 0
        for k, v in ss.value_dict.iteritems():
            if k in cohort_samples:
                if v == 1:
                    hits += 1
                new_value_dict[k] = v
        # only emit sample sets that retain at least one hit
        if hits > 0:
            ss = SampleSet(ss.name, ss.desc, new_value_dict.items())
            print ss.to_json()
        else:
            logging.warning('Sample set %s has no hits' % (ss.name))
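# Effect of the cohort filter above (hypothetical data): with cohort
# samples {'S1', 'S2'}, a sample set {'S1': 1, 'S3': 1} is reduced to
# {'S1': 1} and emitted (one hit survives), while {'S3': 1, 'S4': 0}
# retains no hits inside the cohort and is dropped with a warning.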
def info(args):
    input_dirs = args.input_dirs
    if input_dirs is None:
        logging.error('Please specify SSEA directories using "-d" or "--dir"')
        return 1
    for path in find_paths(input_dirs):
        sample_set_json_file = os.path.join(path, Config.SAMPLE_SET_JSON_FILE)
        sample_set = SampleSet.parse_json(sample_set_json_file)[0]
        compname = computerize_name(sample_set.name)
        print '\t'.join([compname, path])
def generate_random_sample_set(minsize, maxsize, samples):
    # pick a random subset of samples and assign each a random 0/1 value;
    # iterating over all samples keeps the output in input order
    size = random.randint(minsize, maxsize)
    randsamples = random.sample(samples, size)
    randvalues = np.random.random_integers(0, 1, size=size)
    values = []
    for i in xrange(len(samples)):
        if samples[i] in randsamples:
            index = randsamples.index(samples[i])
            values.append((samples[i], randvalues[index]))
    return SampleSet(name="SS", desc="Sample Set", values=values)
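# Minimal usage sketch (hypothetical universe and size bounds): draw one
# random sample set over a small universe of ten samples.
def _example_random_sample_set():
    samples = ['S%d' % (i) for i in range(10)]
    ss = generate_random_sample_set(1, 5, samples)
    # value_dict maps sample name -> 0/1 membership for the chosen subset
    return ss.value_dict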
def subset(args):
    filename = args.sample_set_file
    sep = args.sep
    name = args.name
    desc = args.desc
    hit_set_names = args.hit_sets
    miss_set_names = args.miss_sets
    # check arguments
    if name is None:
        name = 'subset'
    if desc is None:
        desc = name
    if not hit_set_names or not miss_set_names:
        logging.error('Sample sets to be considered "hits" or "misses" '
                      'should be specified using --hit and --miss')
        return 1
    sample_sets = dict((ss.name, ss) for ss in
                       _parse_sample_sets(filename, sep))
    new_value_dict = {}
    # samples that are hits in any hit set get a value of 1
    for ss_name in hit_set_names:
        if ss_name not in sample_sets:
            logging.error('Sample set name "%s" not found. Exiting.' % (ss_name))
            return 1
        ss = sample_sets[ss_name]
        for k, v in ss.value_dict.iteritems():
            if v == 1:
                new_value_dict[k] = 1
    # samples that are hits in any miss set get a value of 0; because miss
    # sets are processed last, a miss overrides membership in a hit set
    for ss_name in miss_set_names:
        if ss_name not in sample_sets:
            logging.error('Sample set name "%s" not found. Exiting.' % (ss_name))
            return 1
        ss = sample_sets[ss_name]
        for k, v in ss.value_dict.iteritems():
            if v == 1:
                new_value_dict[k] = 0
    ss = SampleSet(name, desc, new_value_dict.items())
    print ss.to_json()
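# Worked example of the hit/miss semantics above (hypothetical data): a
# sample that is a hit in both a --hit set and a --miss set ends up 0,
# because miss sets are applied last.
#   hit set  'groupA': {'S1': 1, 'S2': 1}
#   miss set 'groupB': {'S2': 1, 'S3': 1}
#   result:            {'S1': 1, 'S2': 0, 'S3': 0}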
def merge(args):
    # get args
    input_paths_file = args.input_paths_file
    input_dirs = args.input_dirs
    output_dir = args.output_dir
    # check args
    input_paths = set()
    if input_paths_file is not None:
        if not os.path.exists(input_paths_file):
            logging.error('Input paths file "%s" not found' % (input_paths_file))
        else:
            with open(input_paths_file) as fileh:
                for line in fileh:
                    path = line.strip()
                    if check_path(path):
                        input_paths.add(path)
    if input_dirs is not None:
        input_paths.update(find_paths(input_dirs))
    if len(input_paths) == 0:
        logging.error('No valid SSEA results directories found. Exiting.')
        return 1
    if not os.path.exists(output_dir):
        logging.debug('Creating output directory "%s"' % (output_dir))
        os.makedirs(output_dir)
    # read paths already in output directory
    # (NOTE: existing_paths is collected but not consulted below)
    existing_paths = set()
    for path in glob.iglob(os.path.join(output_dir, "*")):
        if os.path.exists(path) and os.path.isdir(path):
            existing_paths.add(path)
    # merge input paths
    for src in input_paths:
        sample_set_json_file = os.path.join(src, Config.SAMPLE_SET_JSON_FILE)
        sample_set = SampleSet.parse_json(sample_set_json_file)[0]
        dirname = computerize_name(sample_set.name)
        dst = os.path.join(output_dir, dirname)
        if os.path.exists(dst):
            logging.error('Conflict when merging sample set name "%s" from path "%s"' %
                          (sample_set.name, src))
        else:
            logging.debug('Moving sample set "%s" from "%s" to "%s"' %
                          (sample_set.name, src, dst))
            shutil.move(src, dst)
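# Directory-level effect of merge() above (hypothetical paths): each input
# results directory is renamed after its sample set and moved into place.
#   src: /runs/run1                     (sample_set.json names "Tumor vs Normal")
#   dst: <output_dir>/tumor_vs_normal   (via computerize_name + shutil.move)
# The exact form of computerize_name's output is an assumption here.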
def test_sample_set_smt_parser(self):
    # generate samples
    samples = ['S%d' % (i) for i in range(10000)]
    # generate sample sets
    N = 100
    minsize = 1
    maxsize = N
    sample_sets = []
    for i in xrange(N):
        sample_sets.append(generate_random_sample_set(minsize, maxsize, samples))
    # write to a temp file in .smt layout: a header row, then one row per
    # sample set with Name, Description, and one column per sample
    fileh = open('tmp', 'w')
    fields = ['Name', 'Description']
    fields.extend(samples)
    print >>fileh, '\t'.join(fields)
    for i in xrange(len(sample_sets)):
        ss = sample_sets[i]
        fields = [ss.name, ss.desc]
        for j in xrange(len(samples)):
            if samples[j] in ss.value_dict:
                fields.append(ss.value_dict[samples[j]])
            else:
                fields.append('')
        print >>fileh, '\t'.join(map(str, fields))
    fileh.close()
    # read back and compare against the originals
    read_sample_sets = SampleSet.parse_smt('tmp')
    self.assertTrue(len(read_sample_sets) == N)
    self.assertTrue(len(read_sample_sets) == len(sample_sets))
    for i in xrange(N):
        ss = sample_sets[i]
        rss = read_sample_sets[i]
        self.assertEqual(rss.name, ss.name)
        self.assertEqual(rss.desc, ss.desc)
        self.assertTrue(set(rss.value_dict.items()) ==
                        set(ss.value_dict.items()))
        a = ss.get_array(samples)
        b = rss.get_array(samples)
        self.assertTrue(np.array_equal(a, b))
    os.remove('tmp')