Beispiel #1
0
    # Plot time series and best shapelet onto '<class>_shapelet.eps'
    # 1) Open and read in lc from the source file

    testing_dir = LC_DIR + '/' + TEST_DIR
    # sample 3 for each file type
    class_tests = {}
    for fname in os.listdir(testing_dir):
        fclass = fname.split('_')[0]
        if fclass not in class_tests.keys():
            class_tests[fclass] = [fname]
        else:
            class_tests[fclass].append(fname)

    for classname in class_tests.keys():
        for fname in random.sample(class_tests[classname], 3):
            test_lc = file_to_lc(LC_DIR + '/' + TEST_DIR + '/' + fname)
            test_time = test_lc.time
            test_flux = test_lc.flux
            test_class = fname.split('/')[-1].split('_')[0]
            new_time, new_flux = ([], [])
            for i in xrange(len(test_flux)):
                if test_flux[i] != '-':
                    new_time.append(test_time[i])
                    new_flux.append(test_flux[i])
            plt.plot(new_time, new_flux, 'xk')
            colors = ['r', 'b', 'g', 'c', 'm', 'y']
            styles = ['-', '-.']
            print "test class:", test_class
            legtext = 'Original TS (class {0})'.format(test_class)
            legends = [legtext]
            for sh_num, sh_class in enumerate(best_line.keys()):
Beispiel #2
0
import lightcurve
import features
import utils

# Print the time/flux feature vector of one sample light curve per class.
# Single-argument print(...) emits the same text as the print statement.
LC_PATH = 'lightcurves/norm_n1.5_a100_m0_s400/'

# SNe: the curve is loaded before its label is printed, and the values are
# shown as unrounded strings.
lc = lightcurve.file_to_lc(LC_PATH + 'SNe_wide_25.data')
print("SNe")
print(['{0}'.format(value) for value in features.time_flux(lc)])

# Remaining classes: label first, then load, then values rounded to 3 places.
for class_label, data_name in (("ESE", 'ESE_wide_25.data'),
                               ("IDV", 'IDV_wide_25.data'),
                               ("Novae", 'Novae_wide_25.data')):
    print(class_label)
    lc = lightcurve.file_to_lc(LC_PATH + data_name)
    print([round(value, 3) for value in features.time_flux(lc)])
def expdir_to_arff(lc_files, dyncache, dyncache_keyset, exp_dir, arff_fname):
	"""Write an ARFF file holding one feature row per light curve in lc_files.

	The attribute header is built from FEATDESC_FNAME (one tab-separated
	"name, count" pair per line; lines starting with '#' are comments) and
	from the class names listed one per line in CLASS_FNAME.  The feature
	vector of each light curve is taken, in this order, from the in-memory
	dyncache, from the 'featcache' table of the sqlite database
	'feat_cache.db' (first column is the key), or from a fresh extraction
	with lc_to_features, which is then inserted into that table.

	Args:
		lc_files: light curve file names; the text before the first '_'
			is written as the class label of the row.
		dyncache: dict mapping light curve path to its feature sequence;
			updated in place.
		dyncache_keyset: set of the keys held in dyncache; updated in place.
		exp_dir: sub-directory of LC_DIR that holds the files; also used
			as the ARFF relation name.
		arff_fname: path of the ARFF file to write (overwritten).

	Returns:
		The tuple (dyncache, dyncache_keyset), i.e. the objects passed in.
	"""
	# Load the description of each feature (name and count) for the header.
	feat_names = []
	feat_counts = {}
	with open(FEATDESC_FNAME) as featdesc_file:
		for line in featdesc_file:
			# '#' in the first column marks a comment.  Blank lines carry no
			# count and used to raise IndexError, so they are skipped too.
			if line[0] == '#' or not line.strip():
				continue
			fields = line.strip().split('\t')
			feat_names.append(fields[0])
			feat_counts[fields[0]] = int(fields[1])
	# and the classes
	with open(CLASS_FNAME) as class_file:
		classes = [line.strip() for line in class_file]

	# Progress is reported roughly every tenth of the input.  Clamp to 1 so
	# that fewer than 10 files no longer causes a modulo by zero below.
	increment = max(1, len(lc_files) // 10)

	with open(arff_fname, 'w') as arff_file:
		# produce the header
		arff_file.write("% Light curve classification features\n\n")
		arff_file.write("@RELATION {0}\n\n".format(exp_dir))
		for feat_name in feat_names:
			if feat_counts[feat_name] == 1: # only 1 feature
				arff_file.write('@ATTRIBUTE {0} NUMERIC\n'.format(feat_name))
			else:
				for i in range(feat_counts[feat_name]):
					arff_file.write('@ATTRIBUTE {0}{1} NUMERIC\n'.format(feat_name, i))
		arff_file.write('@ATTRIBUTE class {' + ', '.join(classes) + '}\n\n')
		arff_file.write('@DATA\n')

		conn = sqlite3.connect('feat_cache.db')
		try:
			c = conn.cursor()
			for done, lc_file in enumerate(lc_files):
				if done % increment == 0 and done != 0:
					print("{0}/{1}".format(done, len(lc_files)))

				lc_class = lc_file.split('_')[0]
				lc_path = '{0}/{1}/{2}'.format(LC_DIR, exp_dir, lc_file)

				# check to see if features are in dynamic cache first
				if lc_path in dyncache_keyset:
					feats = dyncache[lc_path]
				else: # do db lookup
					search_result = c.execute(
						'''select * from featcache where key=?''', [lc_path]).fetchall()
					if len(search_result) == 0: # cache miss, extract features
						print("db miss")
						feats = lc_to_features(file_to_lc(lc_path))
						# Bind the values instead of pasting str(tuple) into the
						# SQL text, which broke on NaN/inf and on odd paths.
						# NOTE(review): assumes the feature values are types
						# sqlite3 can bind (int/float/str/None) -- confirm.
						row = [lc_path] + list(feats)
						placeholders = ','.join('?' * len(row))
						c.execute('insert into featcache values ({0})'.format(placeholders), row)
					else:
						feats = search_result[0][1:] # fetch features and remove key
					# either if extracted or fetched from db, add to dynamic cache
					dyncache[lc_path] = feats
					dyncache_keyset.add(lc_path)
				# finally, write in the features
				arff_file.write(','.join([str(obj) for obj in feats]) + ',' + lc_class + '\n')
			conn.commit()
		finally:
			# Always release the database handle, even if extraction failed.
			conn.close()
	return (dyncache, dyncache_keyset)
Beispiel #4
0
import features
import lightcurve
import sys
# Show the last 22 features of one light curve, chosen on the command line
# as: <classtype> <num>.
classtype = sys.argv[1]
num = sys.argv[2]
lc_path = 'lightcurves/norm_n1.5_a100_m0_s400/{0}_wide_{1}.data'.format(
    classtype, num)
# Same two-line report for both extractors; the extractor is looked up and
# the curve is re-read from disk on each pass, exactly as before.
for extractor_name in ('time_flux', 'flux_only'):
    extractor = getattr(features, extractor_name)
    hc = extractor(lightcurve.file_to_lc(lc_path))[-22:]
    print(hc[:12])
    print(hc[12:])
Beispiel #5
0
import lightcurve
import features
import utils

# Print the time/flux feature vector of one sample light curve per class.
# Single-argument print(...) emits the same text as the print statement.
LC_PATH = 'lightcurves/norm_n1.5_a100_m0_s400/'

# SNe: the curve is loaded before its label is printed, and the values are
# shown as unrounded strings.
lc = lightcurve.file_to_lc(LC_PATH + 'SNe_wide_25.data')
print("SNe")
print(['{0}'.format(value) for value in features.time_flux(lc)])

# Remaining classes: label first, then load, then values rounded to 3 places.
for class_label, data_name in (("ESE", 'ESE_wide_25.data'),
                               ("IDV", 'IDV_wide_25.data'),
                               ("Novae", 'Novae_wide_25.data')):
    print(class_label)
    lc = lightcurve.file_to_lc(LC_PATH + data_name)
    print([round(value, 3) for value in features.time_flux(lc)])
Beispiel #6
0
def shapelet_features(apply_dir, args):
    """Write shapelet-distance features for every cross-fold test light curve.

    For each cross-fold number cfnum below NUM_CROSSFOLDS, every file named in
    crossfold/cf<cfnum>/test is loaded from LC_DIR/<apply_dir> and compared
    with every shapelet file in shapelet_features/<featureset>/cf<cfnum>.  The
    distances are written as one comma-separated line to
    RAW_FEAT_DIR/<featureset>/<apply_dir>/<file name>.  If that output
    directory already exists nothing is extracted.

    Args:
        apply_dir: name of the light curve directory under LC_DIR to which
            the shapelets are applied.
        args: shapelet arguments, handed to getshoutdir.getshfeatdir; elements
            0, 1 and 2 of its result are read as the feature-set directory
            name, the use-DTW flag and the use-mindist flag.

    Raises:
        ValueError: if neither distance flag is set.  This is raised before
            the output directory is created, so a misconfigured run no longer
            leaves an empty directory that makes later runs skip extraction.
    """
    # Get the parameters associated with the shapelet arguments given from the expt
    params = getshoutdir.getshfeatdir(args)
    print("params: {0}".format(params))
    # This is the directory containing the processed shapelets for the arguments
    shapelet_featureset = params[0]
    shapelet_feature_path = "shapelet_features/{0}".format(shapelet_featureset)
    print("extracting shapelet features using shapelets in: {0}".format(
        shapelet_feature_path))

    # NOTE(review): the literal "raw_features" is presumably the same
    # directory as RAW_FEAT_DIR used below -- confirm.
    featureset_dir = '{0}/{1}'.format("raw_features", shapelet_featureset)
    if not os.path.isdir(featureset_dir):
        os.mkdir(featureset_dir)
    # This is the directory where the resulting features are going
    feature_out_dir = "{0}/{1}/{2}".format(
        RAW_FEAT_DIR, shapelet_featureset, apply_dir)
    print("features extracted to: {0}".format(feature_out_dir))
    if os.path.isdir(feature_out_dir):
        print("directory already exists {0}".format(feature_out_dir))
        return  # do not extract

    # Pick the distance measure before touching the output directory.
    use_dtw = params[1]
    use_md = params[2]
    if use_md:
        dist_func = distances.mindist
    elif use_dtw:
        dist_func = distances.dtw
    else:
        raise ValueError(
            "no distance measure selected: neither the mindist nor the DTW "
            "flag is set")

    print("creating directory: {0}".format(feature_out_dir))
    os.mkdir(feature_out_dir)

    # This is the lightcurve directory to which we apply the shapelets
    apply_dir = LC_DIR + "/" + apply_dir
    for cfnum in range(NUM_CROSSFOLDS):
        print("crossfold {0}".format(cfnum))
        test_list = "crossfold/cf{0}/test".format(cfnum)
        shapelet_source_dir = "{0}/cf{1}".format(shapelet_feature_path, cfnum)
        with open(test_list) as test_file:
            for fnum, fname in enumerate(test_file):
                if fnum % 10 == 0:
                    print("{0} files processed".format(fnum))
                fname = fname.strip()
                if not fname:
                    # A blank line names no light curve; it used to end in a
                    # failed open() on the output directory itself.
                    continue
                # Open extraction file to lc
                extract_from = lightcurve.file_to_lc(apply_dir + "/" + fname)
                feature_values = []
                # Load all the shapelets of this cross-fold and find distances.
                # NOTE(review): the shapelet files are re-read for every test
                # file; they look loop-invariant per cross-fold and could be
                # loaded once if file_to_lc has no side effects -- confirm.
                for shapelet_filename in os.listdir(shapelet_source_dir):
                    shapelet_path = shapelet_source_dir + '/' + shapelet_filename
                    measure_with_flux = lightcurve.file_to_lc(shapelet_path).flux
                    if len(measure_with_flux) == 0:
                        print("missing a shapelet file: {0}".format(shapelet_path))
                        continue
                    # Only the first element of the measure's result is kept.
                    distance = dist_func(extract_from.flux, measure_with_flux)[0]
                    feature_values.append(distance)
                # Finally, write out all the features
                with open(feature_out_dir + "/" + fname, 'w') as feat_outfile:
                    feat_outfile.write(','.join([str(o) for o in feature_values]))
Beispiel #7
0
            if update:
                best_line[sh_class] = line
                best[sh_class] = line[-2]
                best_SD[sh_class] = line[-1]

        # Write these out to the appropriate directory
        out_dir = "{0}/cf{1}".format(SHAPELET_FEATURE_DIR, cfnum)
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)
        debug_dir = "{0}/cf{1}".format(SHAPELET_DEBUG_DIR, cfnum)
        if not os.path.isdir(debug_dir):
            os.mkdir(debug_dir)
        for sh_class in best_line.keys():
            print "class:", sh_class, "id:", best_line[sh_class][0]
            source_filename = best_line[sh_class][1].split('/')[-1]
            source = lightcurve.file_to_lc('{0}/{1}'.format(
                SHAPELET_SOURCE_DIR, source_filename))
            sh_start = int(best_line[sh_class][2])
            sh_end = int(best_line[sh_class][3]) + sh_start
            debug_index.write('{0},{1}\n'.format(sh_class,
                                                 best_line[sh_class][0]))
            out_file = open("{0}/{1}_shapelet_{2}.data".format(
                out_dir, sh_class, cfnum), 'w')  # c is the class
            print "writing to file:", out_file
            for t, f in izip(source.time[sh_start:sh_end],
                             source.flux[sh_start:sh_end]):
                out_file.write('{0}\t{1}\n'.format(t, f))
            out_file.close()
            plt.plot(source.time[:sh_start], source.flux[:sh_start], 'k',
                     source.time[sh_end:], source.flux[sh_end:], 'k',
                     source.time[sh_start:sh_end],
                     source.flux[sh_start:sh_end], 'r')
Beispiel #8
0
		# Extract features from every light curve in training directory
		print "extracting features for", exp_feat_dir
		for tf, train_test in enumerate([train, test]): # just for convenience
			# Extract shapelets if necessary (external step to other feature extraction)
			if "-" in feat_id or 'shapelet' in feat_id: # ugh
				print "extracting shapelet features for directory:", train_test
				#if tf == 0: # if we are computing the training set OH GOD SO HACKED
				#	print comp_features[feat_id]
				#	if comp_features[feat_id][1] != 'None': # if there is a forced train set
				#		print train_test
				#		train_test = comp_features[feat_id][1]
				#		print "using forced training set:", train_test
				shapelet_features(train_test, comp_features[feat_id][0]) # extract all shapelets for train_test with args
				continue # do not proceed (what would we do anyway?)
			outdir = "{0}/{1}".format(exp_feat_dir, train_test)
			if os.path.isdir(outdir):
				print "features already extracted to", outdir, "skipping"
				continue
			os.mkdir(outdir)
			
			lcs_to_extract = os.listdir("{0}/{1}".format(LC_DIR, train_test))
			for fname in lcs_to_extract:
				outfile = open("{0}/{1}".format(outdir,fname), 'w')
				if fname == ".DS_Store": # skip this stupid shit
					continue
				data_fname = "{0}/{1}/{2}".format(LC_DIR, train_test, fname)
				lc = lightcurve.file_to_lc(data_fname)
				features = [data_fname] + apply(eval(feat_id),[lc])
				outfile.write(','.join([str(o) for o in features]) + '\n')
			outfile.close()
Beispiel #9
0
	# Plot time series and best shapelet onto '<class>_shapelet.eps'
	# 1) Open and read in lc from the source file
	
	testing_dir = LC_DIR + '/' + TEST_DIR
	# sample 3 for each file type
	class_tests = {}
	for fname in os.listdir(testing_dir):
		fclass = fname.split('_')[0]
		if fclass not in class_tests.keys():
			class_tests[fclass] = [fname]
		else:
			class_tests[fclass].append(fname)
	
	for classname in class_tests.keys():
		for fname in random.sample(class_tests[classname], 3):
			test_lc = file_to_lc(LC_DIR + '/' + TEST_DIR + '/' + fname)
			test_time = test_lc.time
			test_flux = test_lc.flux
			test_class = fname.split('/')[-1].split('_')[0]
			new_time, new_flux = ([], [])
			for i in xrange(len(test_flux)):
				if test_flux[i] != '-':
					new_time.append(test_time[i])
					new_flux.append(test_flux[i])
			plt.plot(new_time, new_flux, 'xk')
			colors = ['r', 'b', 'g', 'c', 'm', 'y']
			styles = ['-', '-.']
			print "test class:", test_class
			legtext = 'Original TS (class {0})'.format(test_class)
			legends = [legtext]
			for sh_num, sh_class in enumerate(best_line.keys()):
Beispiel #10
0
import features
import lightcurve
import sys
# Show the last 22 features of one light curve, chosen on the command line
# as: <classtype> <num>.
classtype = sys.argv[1]
num = sys.argv[2]
lc_path = 'lightcurves/norm_n1.5_a100_m0_s400/{0}_wide_{1}.data'.format(classtype, num)
# Same two-line report for both extractors; the extractor is looked up and
# the curve is re-read from disk on each pass, exactly as before.
for extractor_name in ('time_flux', 'flux_only'):
    extractor = getattr(features, extractor_name)
    hc = extractor(lightcurve.file_to_lc(lc_path))[-22:]
    print(hc[:12])
    print(hc[12:])
Beispiel #11
0
def shapelet_features(apply_dir, args):
	"""Write shapelet-distance features for every cross-fold test light curve.

	For each cross-fold number cfnum below NUM_CROSSFOLDS, every file named in
	crossfold/cf<cfnum>/test is loaded from LC_DIR/<apply_dir> and compared
	with every shapelet file in shapelet_features/<featureset>/cf<cfnum>.  The
	distances are written as one comma-separated line to
	RAW_FEAT_DIR/<featureset>/<apply_dir>/<file name>.  If that output
	directory already exists nothing is extracted.

	Args:
		apply_dir: name of the light curve directory under LC_DIR to which
			the shapelets are applied.
		args: shapelet arguments, handed to getshoutdir.getshfeatdir; elements
			0, 1 and 2 of its result are read as the feature-set directory
			name, the use-DTW flag and the use-mindist flag.

	Raises:
		ValueError: if neither distance flag is set.  This is raised before
			the output directory is created, so a misconfigured run no longer
			leaves an empty directory that makes later runs skip extraction.
	"""
	# Get the parameters associated with the shapelet arguments given from the expt
	params = getshoutdir.getshfeatdir(args)
	print("params: {0}".format(params))
	# This is the directory containing the processed shapelets for the arguments
	shapelet_featureset = params[0]
	shapelet_feature_path = "shapelet_features/{0}".format(shapelet_featureset)
	print("extracting shapelet features using shapelets in: {0}".format(shapelet_feature_path))

	# NOTE(review): the literal "raw_features" is presumably the same
	# directory as RAW_FEAT_DIR used below -- confirm.
	featureset_dir = '{0}/{1}'.format("raw_features", shapelet_featureset)
	if not os.path.isdir(featureset_dir):
		os.mkdir(featureset_dir)
	# This is the directory where the resulting features are going
	feature_out_dir = "{0}/{1}/{2}".format(RAW_FEAT_DIR, shapelet_featureset, apply_dir)
	print("features extracted to: {0}".format(feature_out_dir))
	if os.path.isdir(feature_out_dir):
		print("directory already exists {0}".format(feature_out_dir))
		return # do not extract

	# Pick the distance measure before touching the output directory.
	use_dtw = params[1]
	use_md = params[2]
	if use_md:
		dist_func = distances.mindist
	elif use_dtw:
		dist_func = distances.dtw
	else:
		raise ValueError(
			"no distance measure selected: neither the mindist nor the DTW "
			"flag is set")

	print("creating directory: {0}".format(feature_out_dir))
	os.mkdir(feature_out_dir)

	# This is the lightcurve directory to which we apply the shapelets
	apply_dir = LC_DIR + "/" + apply_dir
	for cfnum in range(NUM_CROSSFOLDS):
		print("crossfold {0}".format(cfnum))
		test_list = "crossfold/cf{0}/test".format(cfnum)
		shapelet_source_dir = "{0}/cf{1}".format(shapelet_feature_path, cfnum)
		with open(test_list) as test_file:
			for fnum, fname in enumerate(test_file):
				if fnum % 10 == 0:
					print("{0} files processed".format(fnum))
				fname = fname.strip()
				if not fname:
					# A blank line names no light curve; it used to end in a
					# failed open() on the output directory itself.
					continue
				# Open extraction file to lc
				extract_from = lightcurve.file_to_lc(apply_dir + "/" + fname)
				feature_values = []
				# Load all the shapelets of this cross-fold and find distances.
				# NOTE(review): the shapelet files are re-read for every test
				# file; they look loop-invariant per cross-fold and could be
				# loaded once if file_to_lc has no side effects -- confirm.
				for shapelet_filename in os.listdir(shapelet_source_dir):
					shapelet_path = shapelet_source_dir + '/' + shapelet_filename
					measure_with_flux = lightcurve.file_to_lc(shapelet_path).flux
					if len(measure_with_flux) == 0:
						print("missing a shapelet file: {0}".format(shapelet_path))
						continue
					# Only the first element of the measure's result is kept.
					distance = dist_func(extract_from.flux, measure_with_flux)[0]
					feature_values.append(distance)
				# Finally, write out all the features
				with open(feature_out_dir + "/" + fname, 'w') as feat_outfile:
					feat_outfile.write(','.join([str(o) for o in feature_values]))
Beispiel #12
0
        for tf, train_test in enumerate([train, test]):  # just for convenience
            # Extract shapelets if necessary (external step to other feature extraction)
            if "-" in feat_id or 'shapelet' in feat_id:  # ugh
                print "extracting shapelet features for directory:", train_test
                #if tf == 0: # if we are computing the training set OH GOD SO HACKED
                #	print comp_features[feat_id]
                #	if comp_features[feat_id][1] != 'None': # if there is a forced train set
                #		print train_test
                #		train_test = comp_features[feat_id][1]
                #		print "using forced training set:", train_test
                shapelet_features(
                    train_test, comp_features[feat_id]
                    [0])  # extract all shapelets for train_test with args
                continue  # do not proceed (what would we do anyway?)
            outdir = "{0}/{1}".format(exp_feat_dir, train_test)
            if os.path.isdir(outdir):
                print "features already extracted to", outdir, "skipping"
                continue
            os.mkdir(outdir)

            lcs_to_extract = os.listdir("{0}/{1}".format(LC_DIR, train_test))
            for fname in lcs_to_extract:
                outfile = open("{0}/{1}".format(outdir, fname), 'w')
                if fname == ".DS_Store":  # skip this stupid shit
                    continue
                data_fname = "{0}/{1}/{2}".format(LC_DIR, train_test, fname)
                lc = lightcurve.file_to_lc(data_fname)
                features = [data_fname] + apply(eval(feat_id), [lc])
                outfile.write(','.join([str(o) for o in features]) + '\n')
            outfile.close()
Beispiel #13
0
def expdir_to_arff(lc_files, dyncache, dyncache_keyset, exp_dir, arff_fname):
    """Write an ARFF file holding one feature row per light curve in lc_files.

    The attribute header is built from FEATDESC_FNAME (one tab-separated
    "name, count" pair per line; lines starting with '#' are comments) and
    from the class names listed one per line in CLASS_FNAME.  The feature
    vector of each light curve is taken, in this order, from the in-memory
    dyncache, from the 'featcache' table of the sqlite database
    'feat_cache.db' (first column is the key), or from a fresh extraction
    with lc_to_features, which is then inserted into that table.

    Args:
        lc_files: light curve file names; the text before the first '_'
            is written as the class label of the row.
        dyncache: dict mapping light curve path to its feature sequence;
            updated in place.
        dyncache_keyset: set of the keys held in dyncache; updated in place.
        exp_dir: sub-directory of LC_DIR that holds the files; also used
            as the ARFF relation name.
        arff_fname: path of the ARFF file to write (overwritten).

    Returns:
        The tuple (dyncache, dyncache_keyset), i.e. the objects passed in.
    """
    # Load the description of each feature (name and count) for the header.
    feat_names = []
    feat_counts = {}
    with open(FEATDESC_FNAME) as featdesc_file:
        for line in featdesc_file:
            # '#' in the first column marks a comment.  Blank lines carry no
            # count and used to raise IndexError, so they are skipped too.
            if line[0] == '#' or not line.strip():
                continue
            fields = line.strip().split('\t')
            feat_names.append(fields[0])
            feat_counts[fields[0]] = int(fields[1])
    # and the classes
    with open(CLASS_FNAME) as class_file:
        classes = [line.strip() for line in class_file]

    # Progress is reported roughly every tenth of the input.  Clamp to 1 so
    # that fewer than 10 files no longer causes a modulo by zero below.
    increment = max(1, len(lc_files) // 10)

    with open(arff_fname, 'w') as arff_file:
        # produce the header
        arff_file.write("% Light curve classification features\n\n")
        arff_file.write("@RELATION {0}\n\n".format(exp_dir))
        for feat_name in feat_names:
            if feat_counts[feat_name] == 1:  # only 1 feature
                arff_file.write('@ATTRIBUTE {0} NUMERIC\n'.format(feat_name))
            else:
                for i in range(feat_counts[feat_name]):
                    arff_file.write('@ATTRIBUTE {0}{1} NUMERIC\n'.format(
                        feat_name, i))
        arff_file.write('@ATTRIBUTE class {' + ', '.join(classes) + '}\n\n')
        arff_file.write('@DATA\n')

        conn = sqlite3.connect('feat_cache.db')
        try:
            c = conn.cursor()
            for done, lc_file in enumerate(lc_files):
                if done % increment == 0 and done != 0:
                    print("{0}/{1}".format(done, len(lc_files)))

                lc_class = lc_file.split('_')[0]
                lc_path = '{0}/{1}/{2}'.format(LC_DIR, exp_dir, lc_file)

                # check to see if features are in dynamic cache first
                if lc_path in dyncache_keyset:
                    feats = dyncache[lc_path]
                else:  # do db lookup
                    search_result = c.execute(
                        '''select * from featcache where key=?''',
                        [lc_path]).fetchall()
                    if len(search_result) == 0:  # cache miss, extract features
                        print("db miss")
                        feats = lc_to_features(file_to_lc(lc_path))
                        # Bind the values instead of pasting str(tuple) into
                        # the SQL text, which broke on NaN/inf and odd paths.
                        # NOTE(review): assumes the feature values are types
                        # sqlite3 can bind (int/float/str/None) -- confirm.
                        row = [lc_path] + list(feats)
                        placeholders = ','.join('?' * len(row))
                        c.execute(
                            'insert into featcache values ({0})'.format(
                                placeholders), row)
                    else:
                        # fetch features and remove key
                        feats = search_result[0][1:]
                    # either if extracted or fetched from db, add to dynamic cache
                    dyncache[lc_path] = feats
                    dyncache_keyset.add(lc_path)
                # finally, write in the features
                arff_file.write(','.join([str(obj) for obj in feats]) + ',' +
                                lc_class + '\n')
            conn.commit()
        finally:
            # Always release the database handle, even if extraction failed.
            conn.close()
    return (dyncache, dyncache_keyset)
Beispiel #14
0
			if update:
				best_line[sh_class] = line
				best[sh_class] = line[-2]
				best_SD[sh_class] = line[-1]

		# Write these out to the appropriate directory
		out_dir = "{0}/cf{1}".format(SHAPELET_FEATURE_DIR, cfnum)
		if not os.path.isdir(out_dir):
			os.mkdir(out_dir)
		debug_dir = "{0}/cf{1}".format(SHAPELET_DEBUG_DIR, cfnum)
		if not os.path.isdir(debug_dir):
			os.mkdir(debug_dir)
		for sh_class in best_line.keys():
			print "class:", sh_class, "id:", best_line[sh_class][0]
			source_filename = best_line[sh_class][1].split('/')[-1]
			source = lightcurve.file_to_lc('{0}/{1}'.format(SHAPELET_SOURCE_DIR, source_filename))
			sh_start = int(best_line[sh_class][2])
			sh_end = int(best_line[sh_class][3]) + sh_start
			debug_index.write('{0},{1}\n'.format(sh_class, best_line[sh_class][0]))
			out_file = open("{0}/{1}_shapelet_{2}.data".format(out_dir, sh_class, cfnum), 'w') # c is the class
			print "writing to file:", out_file
			for t, f in izip(source.time[sh_start:sh_end], source.flux[sh_start:sh_end]):
				out_file.write('{0}\t{1}\n'.format(t,f))
			out_file.close()
			plt.plot(source.time[:sh_start], source.flux[:sh_start], 'k', source.time[sh_end:], source.flux[sh_end:], 'k', source.time[sh_start:sh_end], source.flux[sh_start:sh_end], 'r')
			plt.xlabel('Time (days)')
			plt.ylabel('Flux (mJy, normalised)')

			plt.savefig("{0}/{1}".format(debug_dir,\
				'{0}_shapelet{1}.pdf'.format(sh_class, 1), format="pdf"))
			plt.close()