# SortGiantPileofSpreadsheets.py
# forked from lmaixner/not-a-database
from __future__ import division, print_function
import errno
import glob
import os
from astropy.table import Table, Column, vstack
from astropy.coordinates import SkyCoord
from astropy import units as u


def assign_id(file1, file2):
    """
    Preconditions: Expects 2 files read as astropy Tables. Files must have RA
    and Dec columns.
    Postconditions: Fills the DataNum column in the second file with the
    DataNum of the closest RA/Dec match in the first file.
    """
    ra1 = file1['RA']
    dec1 = file1['Dec']
    ra2 = file2['RA']
    dec2 = file2['Dec']
    # build SkyCoord catalogs for file1 and file2
    c = SkyCoord(ra=ra1*u.degree, dec=dec1*u.degree)
    catalog = SkyCoord(ra=ra2*u.degree, dec=dec2*u.degree)
    # for each entry in file1, find the index of the closest entry in file2
    idx, d2d, d3d = c.match_to_catalog_3d(catalog)
    # some of the matches are likely to be duplicates or not within a
    # reasonable distance to be the same star, so keep only matches within
    # 2 arcsec of the target
    good_matches = d2d < 2*u.arcsec
    idx2 = idx[good_matches]
    # copy file1's DataNum to the matched rows of file2
    file2['DataNum'][idx2] = file1['DataNum'][good_matches]
    # the two files now share DataNum values for stars whose RA/Dec match
    # closely enough
    return file2
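
# A minimal usage sketch for assign_id (hypothetical file names; assumes both
# csv files already have RA, Dec, and DataNum columns):
#     ref = Table.read('M52_R_image1.csv')
#     other = Table.read('M52_R_image2.csv')
#     other = assign_id(ref, other)   # copies ref's DataNum for close matches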


def sort_files(files):
    """
    Preconditions: Expects a list of csv files that have not yet been read as
    tables.
    Postconditions: Returns the longest file and the list with the longest
    file removed. All of the files have two columns added. The SourceFile
    column has the name of the file. The DataNum column is filled for the
    longest file and has zeros for the rest.
    """
    new_files = list(files)
    fileNames = []
    for file in new_files:
        base_name = os.path.basename(file)
        fileNames.append(base_name)
    ct = 0
    ind = ct
    file1 = Table.read(new_files[ct])
    # find the longest file to use for assigning indexes
    for file in new_files:
        file2 = Table.read(file)
        if len(file2) > len(file1):
            file1 = file2
            ind = ct
        ct += 1
    # removes file1 from new_files
    new_files.pop(ind)
    n_objects1 = len(file1)
    # adds a DataNum column to the table with sequential values to be matched
    # to the rest of the files
    dataNum1_col = Column(data=range(1, n_objects1 + 1), name='DataNum')
    file1.add_column(dataNum1_col)
    # adds a SourceFile column to the table
    fileName_col = Column(data=[fileNames[ind]]*n_objects1, name='SourceFile')
    file1.add_column(fileName_col)
    # removes file1's name from its position in the fileNames list
    del fileNames[ind]
    # run through the rest of the files, add SourceFile and DataNum columns,
    # and assign DataNums consistent with file1's
    ct = 0
    for file in new_files:
        # reads each file as an astropy Table
        cur_file = Table.read(file)
        n_objects2 = len(cur_file)
        # adds a DataNum column filled with zeros, to be matched to file1's
        # values
        dataNum2_col = Column(data=[0]*n_objects2, name='DataNum')
        cur_file.add_column(dataNum2_col)
        fileName_col = Column(data=[fileNames[ct]]*n_objects2, name='SourceFile')
        cur_file.add_column(fileName_col)
        new_files[ct] = assign_id(file1, cur_file)
        ct += 1
    return file1, new_files
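
# A minimal usage sketch for sort_files (hypothetical paths): the longest
# catalog becomes the reference Table and the remaining Tables come back with
# DataNum values matched to it.
#     reference, others = sort_files(glob.glob('M52_R/*R.csv'))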


def f_group(filename):
    """
    Preconditions: Requires a glob pattern for the csv files; the filter type
    letter must be the last letter of the filename.
    Postconditions: Returns one big file with all files for each image added
    as rows.
    """
    files = glob.glob(filename)
    file1, files2 = sort_files(files)
    # use the longest file as the base table to pile all the other files onto
    big_file = file1
    # stack every remaining file onto big_file
    for file in files2:
        big_file = vstack([big_file, file], join_type='exact')
    return big_file
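
# A minimal usage sketch for f_group (hypothetical pattern). Note that vstack
# with join_type='exact' requires every stacked Table to share the same
# column names.
#     combined = f_group('M52_csvs/*R.csv')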


def group_by_filter(f_ext, object, filters=['I', 'R', 'V', 'B'], target_dir='output'):
    """
    Preconditions: Must have the filter type letter as the last letter of the
    filename. Requires the directory location of the csv files and the name
    of the object being observed for naming the output file. Can also take
    the filter types to look for as a list and the output directory to put
    the files in.
    Postconditions: Creates one csv file (named for the observed object and
    filter) for each filter type, containing the csv files for each image
    with the DataNum (matched by RA/Dec) and SourceFile columns added.
    """
    # create the output directory if it does not already exist
    try:
        os.mkdir(target_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise
    pattern = os.path.join(f_ext, '*{}.csv')
    # send all the files to f_group for each filter
    for filt in filters:
        big_file = f_group(pattern.format(filt))
        # outputs a table of the located objects' info in .csv format
        big_file.write(os.path.join(target_dir, object + filt + 'Filt.csv'))
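

# A minimal, hypothetical driver showing how group_by_filter might be called;
# the directory and object names below are assumptions, not part of the
# original pipeline.
if __name__ == '__main__':
    # expects files named like <image>I.csv, <image>R.csv, <image>V.csv,
    # <image>B.csv inside the 'M52_csvs' directory (hypothetical)
    group_by_filter('M52_csvs', 'M52')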