-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_sas.py
executable file
·73 lines (56 loc) · 2.06 KB
/
filter_sas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
from pathlib import Path
from typing import FrozenSet, TypeVar
import sys
import pandas as pd
from pandas.io.sas.sas7bdat import SAS7BDATReader as SasReader
from pandas.io.stata import StataWriterUTF8 as StataWriter
# Type variable for the generic pass-through annotations used in this file.
T = TypeVar('T')

# The SAS input file is taken from the first command-line argument; without
# it there is nothing to do, so bail out with a message.
if len(sys.argv) < 2:
    sys.exit('Missing input file parameter')
_input_file_path: Path = Path(sys.argv[1]).absolute()

# The Stata output lands next to the input, with the suffix replaced.
_output_file_path: Path = _input_file_path.with_suffix('.dta')

# Row filter: the column that is inspected and the value it must equal.
_important_column: str = 'FOO'
_important_value: str = 'foo'

# Columns that are looked up among the input's column names for extraction.
_wanted_columns: FrozenSet[str] = frozenset({'BAR'})

# Chunk size passed to the SAS reader.
_chunksize: int = 1_000_000
def _is_interesting(row: pd.Series) -> bool:
    """Tell whether a row should be kept in the output.

    Args:
        row: One observation, indexed by column name.

    Returns:
        True when the value in ``_important_column`` equals
        ``_important_value``, False otherwise.

    Raises:
        KeyError: If ``row`` has no entry named ``_important_column``.
    """
    # Index with the bare label, not a one-element list: ``row[[label]]``
    # yields a Series, and comparing it produces a boolean Series whose truth
    # value is ambiguous, which makes ``filter()`` raise ValueError.
    return bool(row[_important_column] == _important_value)
class ProgressCounter:
    """Callable pass-through that reports progress in whole percent on stderr.

    Call the instance once per processed item. Whenever the share of
    ``total`` that has been processed reaches a new whole percent, a line of
    the form ``'<name> progress: <n> %'`` is printed to standard error.
    """

    def __init__(self, name: str, total: int):
        """Set up the counter and immediately report 0 %.

        Args:
            name: Label printed in front of every progress line.
            total: Number of calls that corresponds to 100 %.
        """
        self._name = name
        self._total: int = total
        self._progress: int = 0
        self._count: int = 0
        self._print_progress()

    def __call__(self, *args: 'T') -> 'T':
        """Register one processed item and hand the argument back.

        Args:
            *args: The item being processed; it is not inspected.

        Returns:
            The single positional argument unchanged, so the counter can be
            wrapped around items in a pipeline. When called with no argument
            or with several, the tuple of arguments is returned instead.
        """
        self._count += 1
        # Derive the percentage from the running count instead of counting
        # fixed-size steps: a step of ``int(total / 100)`` is 0 for totals
        # below 100 and lets the figure run past 100 % when total is not a
        # multiple of 100.
        if self._total > 0:
            percent = min(100, self._count * 100 // self._total)
            if percent > self._progress:
                self._progress = percent
                self._print_progress()
        return args[0] if len(args) == 1 else args

    def _print_progress(self):
        """Write the current percentage to standard error."""
        print('{} progress: {} %'.format(self._name, self._progress), file=sys.stderr)
# Main pipeline: stream the SAS file chunk by chunk, keep the interesting
# rows, reduce them to the wanted columns and write the result as Stata.
_sas_reader = SasReader(
    _input_file_path,
    encoding='latin-1',
    chunksize=_chunksize,
)
try:
    _number_of_observations = _sas_reader.row_count
    print('Input has {} observations'.format(_number_of_observations), file=sys.stderr)

    # Fail early with a readable message instead of a KeyError on the first
    # row when the filter column is absent from the input.
    if _important_column not in _sas_reader.column_names:
        sys.exit('Input has no column named {}'.format(_important_column))

    # Keep the wanted columns in the order they appear in the input so the
    # output layout is deterministic (a set intersection has no fixed order).
    _extracted_columns = [name
                          for name in _sas_reader.column_names
                          if name in _wanted_columns]

    _input_counter = ProgressCounter('input', _number_of_observations)

    def _iter_input_rows():
        """Yield every input row as a Series, counting each one as progress."""
        for chunk in _sas_reader:
            # Iterating a DataFrame directly yields its column labels;
            # iterrows() is what produces (index, row) pairs.
            for _, row in chunk.iterrows():
                # Called only for its progress side effect.
                _input_counter(row)
                yield row

    _input_row_generator = _iter_input_rows()
    _output_row_generator = (row[_extracted_columns]
                             for row in filter(_is_interesting, _input_row_generator))

    # Build the result from the filtered rows (not from the raw input rows);
    # passing the columns explicitly keeps them even when no row matches.
    _result: pd.DataFrame = pd.DataFrame(list(_output_row_generator),
                                         columns=_extracted_columns)
finally:
    # Release the input file handle whether or not reading succeeded.
    _sas_reader.close()

StataWriter(
    _output_file_path,
    _result,
    write_index=False,
    version=118,
).write_file()