Example #1
0
def merge(args):
    """Combine several named sample sets from a file and print the result as JSON.

    The sets listed in ``args.sample_set_names`` are looked up by name among
    the sets parsed from ``args.sample_set_file`` and folded together in the
    order given.  When a sample appears in more than one set, the value from
    the earliest set is kept.  ``args.name`` defaults to ``'merge'`` and
    ``args.desc`` defaults to the name.

    Returns 1 if no names were given or a name is not in the file; otherwise
    prints the merged set's JSON and returns None.
    """
    source_file = args.sample_set_file
    delimiter = args.sep
    merged_name = args.name
    merged_desc = args.desc
    wanted_names = args.sample_set_names
    # fill in defaults and validate
    if merged_name is None:
        merged_name = 'merge'
    if merged_desc is None:
        merged_desc = merged_name
    if not wanted_names:
        logging.error('No sample set names specified')
        return 1
    by_name = dict((parsed.name, parsed)
                   for parsed in _parse_sample_sets(source_file, delimiter))
    merged_values = {}
    for wanted in wanted_names:
        if wanted not in by_name:
            logging.error('Sample set name "%s" not found.. Exiting.' % (wanted))
            return 1
        for sample, value in by_name[wanted].value_dict.iteritems():
            # the first set that mentions a sample decides its value
            merged_values.setdefault(sample, value)
    merged_set = SampleSet(merged_name, merged_desc, merged_values.items())
    print(merged_set.to_json())
Example #2
0
def _parse_sample_sets(filename, sep):
    """Yield the sample sets stored in ``filename``, picking a parser by suffix.

    ``.smx`` and ``.smt`` files are parsed with ``sep`` as the separator;
    ``.json`` files ignore ``sep``.  Any other suffix logs an error and
    yields nothing.  A missing file logs an error and exits the process with
    status 1.

    This is a generator, so none of the above (including the existence
    check) happens until the caller starts iterating.
    """
    if not os.path.exists(filename):
        logging.error("Sample set file '%s' not found" % (filename))
        sys.exit(1)
    suffix = os.path.splitext(filename)[-1]
    if suffix == '.smx':
        parsed_sets = SampleSet.parse_smx(filename, sep)
    elif suffix == '.smt':
        parsed_sets = SampleSet.parse_smt(filename, sep)
    elif suffix == '.json':
        parsed_sets = SampleSet.parse_json(filename)
    else:
        logging.error('suffix not recognized (.smx, .smt, or .json)')
        return
    for sample_set in parsed_sets:
        yield sample_set
Example #3
0
def db_ss_printJSON(ssea_dir, matrix_dir, ss_id):
    """Print the first sample set of an SSEA directory as one JSON document.

    The set is read from ``<ssea_dir>/sample_set.json``.  It is converted to a
    dict via ``to_dict`` using the array that ``get_array`` returns for the
    column names of the matrix opened from ``matrix_dir``.  The document's
    ``'_id'`` key is set to ``int(ss_id)`` before printing.
    """
    matrix = BigCountMatrix.open(matrix_dir)
    column_names = matrix.colnames
    json_path = os.path.join(ssea_dir, 'sample_set.json')
    # only the first sample set in the file is used
    first_set = SampleSet.parse_json(json_path)[0]
    document = first_set.to_dict(first_set.get_array(column_names))
    document['_id'] = int(ss_id)
    print(json.dumps(document))
Example #4
0
def newcohort(args):
    """Restrict every sample set in a file to a cohort and print each as JSON.

    ``args.cohort_file`` lists one sample name per line (surrounding
    whitespace is stripped).  For each sample set parsed from
    ``args.sample_set_file`` (using ``args.sep``), only the samples present
    in the cohort are kept.  A restricted set is printed as JSON when at
    least one kept sample has the value 1; otherwise a warning is logged and
    nothing is printed for that set.
    """
    sample_set_file = args.sample_set_file
    cohort_file = args.cohort_file
    sep = args.sep
    # read the cohort inside a context manager so the handle is always closed
    with open(cohort_file) as fileh:
        cohort_samples = set(line.strip() for line in fileh)
    for ss in _parse_sample_sets(sample_set_file, sep):
        new_value_dict = {}
        hits = 0
        for k, v in ss.value_dict.iteritems():
            if k not in cohort_samples:
                continue
            if v == 1:
                hits += 1
            new_value_dict[k] = v
        if hits > 0:
            cohort_ss = SampleSet(ss.name, ss.desc, new_value_dict.items())
            print(cohort_ss.to_json())
        else:
            logging.warning('Sample set %s has no hits' % (ss.name))
Example #5
0
def db_ss_printJSON(ssea_dir, matrix_dir, ss_id):
    """Emit the first sample set under ``ssea_dir`` as a JSON record on stdout.

    Reads ``sample_set.json`` from ``ssea_dir`` and takes its first sample
    set.  That set's ``get_array`` is called with the column names of the
    matrix opened from ``matrix_dir``, and the result is passed to
    ``to_dict``.  The resulting dict gets ``'_id'`` set to ``int(ss_id)`` and
    is printed with ``json.dumps``.
    """
    json_file = os.path.join(ssea_dir, 'sample_set.json')
    colnames = BigCountMatrix.open(matrix_dir).colnames
    sample_set = SampleSet.parse_json(json_file)[0]
    record = sample_set.to_dict(sample_set.get_array(colnames))
    record['_id'] = int(ss_id)
    print(json.dumps(record))
Example #6
0
def info(args):
    """Print ``<computerized sample set name><TAB><path>`` for each SSEA directory.

    Directories come from ``find_paths(args.input_dirs)``.  For each one, the
    first sample set in its ``Config.SAMPLE_SET_JSON_FILE`` supplies the name,
    which is passed through ``computerize_name``.

    Returns 1 (after logging an error) when ``args.input_dirs`` is None;
    otherwise returns None.
    """
    input_dirs = args.input_dirs
    if input_dirs is None:
        logging.error('Please specify SSEA directories using "-d" or "--dir"')
        return 1
    for ssea_path in find_paths(input_dirs):
        json_file = os.path.join(ssea_path, Config.SAMPLE_SET_JSON_FILE)
        # only the first sample set in the file is reported
        first_set = SampleSet.parse_json(json_file)[0]
        columns = (computerize_name(first_set.name), ssea_path)
        print('\t'.join(columns))
Example #7
0
def info(args):
    """List SSEA result directories as tab-separated ``name, path`` lines.

    Each path yielded by ``find_paths(args.input_dirs)`` is printed next to
    the ``computerize_name`` form of the name of the first sample set found
    in that path's ``Config.SAMPLE_SET_JSON_FILE``.

    Logs an error and returns 1 if ``args.input_dirs`` is None; returns None
    otherwise.
    """
    search_dirs = args.input_dirs
    if search_dirs is None:
        logging.error('Please specify SSEA directories using "-d" or "--dir"')
        return 1
    for result_dir in find_paths(search_dirs):
        sample_sets = SampleSet.parse_json(
            os.path.join(result_dir, Config.SAMPLE_SET_JSON_FILE))
        label = computerize_name(sample_sets[0].name)
        print('\t'.join([label, result_dir]))
Example #8
0
def generate_random_sample_set(minsize, maxsize, samples):
    """Build a SampleSet over a random subset of ``samples`` with random 0/1 values.

    A size is drawn uniformly from ``[minsize, maxsize]`` (both inclusive),
    that many samples are picked without replacement, and each picked sample
    is paired with a random integer that is 0 or 1.  The resulting
    ``(sample, value)`` pairs are emitted in the order the samples occur in
    ``samples``.

    ``samples`` must be a sequence of hashable items.  ``random.sample``
    raises ValueError if the drawn size exceeds ``len(samples)``.
    """
    size = random.randint(minsize, maxsize)
    randsamples = random.sample(samples, size)
    # randint's upper bound is exclusive, so (0, 2) draws from {0, 1}; this
    # replaces the deprecated np.random.random_integers(0, 1, ...)
    randvalues = np.random.randint(0, 2, size=size)
    # Remember the first position of every picked sample once, instead of
    # scanning the picked list again for each element of ``samples``.
    first_position = {}
    for position, sample in enumerate(randsamples):
        first_position.setdefault(sample, position)
    values = [(sample, randvalues[first_position[sample]])
              for sample in samples
              if sample in first_position]
    return SampleSet(name="SS",
                     desc="Sample Set",
                     values=values)
Example #9
0
def subset(args):
    """Build a hit/miss sample set from existing sets and print it as JSON.

    Samples with value 1 in any set named in ``args.hit_sets`` are labelled
    1.  Samples with value 1 in any set named in ``args.miss_sets`` are then
    labelled 0, so a sample that qualifies as both ends up as 0.  The sets
    are looked up by name in ``args.sample_set_file``.  ``args.name``
    defaults to ``'subset'`` and ``args.desc`` defaults to the name.

    Returns 1 if either list of names is empty or a name is not found in the
    file; otherwise prints the new set's JSON and returns None.
    """
    source_file = args.sample_set_file
    delimiter = args.sep
    subset_name = args.name
    subset_desc = args.desc
    hit_set_names = args.hit_sets
    miss_set_names = args.miss_sets
    # fill in defaults and validate
    if subset_name is None:
        subset_name = 'subset'
    if subset_desc is None:
        subset_desc = subset_name
    if not (hit_set_names and miss_set_names):
        logging.error('Sample sets to be considered "hits" or "misses" '
                      'should be specified using --hit and --miss')
        return 1
    by_name = dict((parsed.name, parsed)
                   for parsed in _parse_sample_sets(source_file, delimiter))
    labels = {}
    # hits are applied before misses, so misses overwrite on overlap
    for wanted_names, label in ((hit_set_names, 1), (miss_set_names, 0)):
        for wanted in wanted_names:
            if wanted not in by_name:
                logging.error('Sample set name "%s" not found.. Exiting.' % (wanted))
                return 1
            for sample, value in by_name[wanted].value_dict.iteritems():
                if value == 1:
                    labels[sample] = label
    result = SampleSet(subset_name, subset_desc, labels.items())
    print(result.to_json())
Example #10
0
def merge(args):
    """Move SSEA result directories into one output directory.

    Source directories are collected from two optional places:
    ``args.input_paths_file`` (one path per line, kept only when
    ``check_path`` accepts it) and ``args.input_dirs`` (expanded with
    ``find_paths``).  A missing paths file is logged as an error but does not
    stop the run.

    Each source is moved to ``<args.output_dir>/<dirname>``, where
    ``dirname`` is ``computerize_name`` applied to the name of the first
    sample set in the source's ``Config.SAMPLE_SET_JSON_FILE``.  If the
    destination already exists the source is left in place and a conflict is
    logged.  The output directory is created when it does not exist.

    Returns 1 when no valid source directory was found, otherwise None.
    """
    # get args
    input_paths_file = args.input_paths_file
    input_dirs = args.input_dirs
    output_dir = args.output_dir
    # collect source directories
    input_paths = set()
    if input_paths_file is not None:
        if not os.path.exists(input_paths_file):
            logging.error('Input paths file "%s" not found' %
                          (input_paths_file))
        else:
            with open(input_paths_file) as fileh:
                for line in fileh:
                    path = line.strip()
                    if check_path(path):
                        input_paths.add(path)
    if input_dirs is not None:
        input_paths.update(find_paths(input_dirs))
    if not input_paths:
        logging.error('No valid SSEA results directories found.. Exiting.')
        return 1
    if not os.path.exists(output_dir):
        logging.debug('Creating output directory "%s"' % (output_dir))
        os.makedirs(output_dir)
    # merge input paths; conflicts are detected per destination below, so no
    # up-front scan of the output directory is needed
    for src in input_paths:
        sample_set_json_file = os.path.join(src, Config.SAMPLE_SET_JSON_FILE)
        sample_set = SampleSet.parse_json(sample_set_json_file)[0]
        dirname = computerize_name(sample_set.name)
        dst = os.path.join(output_dir, dirname)
        if os.path.exists(dst):
            logging.error(
                'Conflict when merging sample set name "%s" from path "%s"' %
                (sample_set.name, src))
        else:
            logging.debug('Moving sample set "%s" from "%s" to "%s"' %
                          (sample_set.name, src, dst))
            shutil.move(src, dst)
Example #11
0
 def test_sample_set_smt_parser(self):
     """Round-trip random sample sets through an SMT file and compare them.

     Writes 100 random sample sets over 10000 samples as a tab-separated
     table (header ``Name, Description, <samples...>``, one row per set,
     empty cell for samples not in the set), parses the file back with
     ``SampleSet.parse_smt`` and checks names, descriptions, value dicts
     and ``get_array`` output against the originals.
     """
     # generate samples
     samples = ['S%d' % (i) for i in range(10000)]
     # generate sample sets
     N = 100
     minsize = 1
     maxsize = N
     sample_sets = [generate_random_sample_set(minsize, maxsize, samples)
                    for _ in range(N)]
     try:
         # write to a temp file; the with-block closes the handle even if
         # building a row raises
         with open('tmp', 'w') as fileh:
             header = ['Name', 'Description']
             header.extend(samples)
             fileh.write('\t'.join(header) + '\n')
             for ss in sample_sets:
                 fields = [ss.name, ss.desc]
                 for sample in samples:
                     if sample in ss.value_dict:
                         fields.append(ss.value_dict[sample])
                     else:
                         fields.append('')
                 fileh.write('\t'.join(map(str, fields)) + '\n')
         # read into sample sets
         read_sample_sets = SampleSet.parse_smt('tmp')
         self.assertEqual(len(read_sample_sets), N)
         self.assertEqual(len(read_sample_sets), len(sample_sets))
         for ss, rss in zip(sample_sets, read_sample_sets):
             self.assertEqual(rss.name, ss.name)
             self.assertEqual(rss.desc, ss.desc)
             self.assertEqual(set(rss.value_dict.items()),
                              set(ss.value_dict.items()))
             a = ss.get_array(samples)
             b = rss.get_array(samples)
             self.assertTrue(np.array_equal(a, b))
     finally:
         # remove the temp file even when an assertion above fails
         if os.path.exists('tmp'):
             os.remove('tmp')
Example #12
0
def merge(args):
    """Gather SSEA result directories and move them under ``args.output_dir``.

    Candidate directories are the lines of ``args.input_paths_file`` that
    pass ``check_path`` (if a file is given) plus everything
    ``find_paths(args.input_dirs)`` returns (if directories are given).  A
    paths file that does not exist is reported with an error and skipped.

    Every candidate is moved to a subdirectory of ``args.output_dir``.  The
    subdirectory is named by ``computerize_name`` of the first sample set's
    name in the candidate's ``Config.SAMPLE_SET_JSON_FILE``.  When that
    subdirectory already exists the candidate is not moved and a conflict is
    logged.  ``args.output_dir`` is created if necessary.

    Returns 1 if there are no candidates, otherwise None.
    """
    # get args
    input_paths_file = args.input_paths_file
    input_dirs = args.input_dirs
    output_dir = args.output_dir
    # check args
    input_paths = set()
    if input_paths_file is not None:
        if os.path.exists(input_paths_file):
            with open(input_paths_file) as fileh:
                stripped = (line.strip() for line in fileh)
                input_paths.update(path for path in stripped
                                   if check_path(path))
        else:
            logging.error('Input paths file "%s" not found' % (input_paths_file))
    if input_dirs is not None:
        input_paths.update(find_paths(input_dirs))
    if not input_paths:
        logging.error('No valid SSEA results directories found.. Exiting.')
        return 1
    if not os.path.exists(output_dir):
        logging.debug('Creating output directory "%s"' % (output_dir))
        os.makedirs(output_dir)
    # merge input paths; each destination is checked right before its move,
    # so the output directory does not have to be listed beforehand
    for src in input_paths:
        sample_set_json_file = os.path.join(src, Config.SAMPLE_SET_JSON_FILE)
        sample_set = SampleSet.parse_json(sample_set_json_file)[0]
        dst = os.path.join(output_dir, computerize_name(sample_set.name))
        if os.path.exists(dst):
            logging.error('Conflict when merging sample set name "%s" from path "%s"' % (sample_set.name, src))
            continue
        logging.debug('Moving sample set "%s" from "%s" to "%s"' % (sample_set.name, src, dst))
        shutil.move(src, dst)