-
Notifications
You must be signed in to change notification settings - Fork 0
/
xls_to_csv.py
44 lines (30 loc) · 1.07 KB
/
xls_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from bs4 import BeautifulSoup
import pandas
import os
import glob
from utils import remove_duplicates, clean
# Convert every "*.xls" file in ./xls into a CSV file of the same base name
# in ./csv.  The inputs are parsed as HTML: the data is read from the first
# <table rules="all"> element of each file.
#
# The script works relative to the ./xls directory and stays there for its
# whole run; outputs are addressed as ../csv/<name>.csv instead of switching
# the working directory back and forth, so a failure while writing one file
# cannot leave the process in the wrong directory.
os.chdir("./xls")
extension = 'xls'
file_names = glob.glob('*.{}'.format(extension))

for file_name in file_names:
    # NOTE(review): 'ANSI' is accepted by Python only as a Windows alias of
    # the 'mbcs' codec; on other platforms this raises LookupError -- confirm
    # the script is Windows-only or pick an explicit code page.
    # BeautifulSoup consumes the file object in its constructor, so the
    # handle can be closed as soon as parsing is done.
    with open(file_name, 'r', encoding='ANSI') as f:
        soup = BeautifulSoup(f, 'html.parser')

    overview = soup.find('table', attrs={'rules': 'all'})
    if overview is None:
        # Without this guard the next line would fail with an opaque
        # AttributeError on None and abort the whole batch.
        print('Skipping {}: no <table rules="all"> found'.format(file_name))
        continue

    content = []
    for row in overview.find_all('tr'):
        # clean() receives the stripped cell text and the cell's column index.
        row_content = [
            clean(element.text.strip(), i)
            for i, element in enumerate(row.find_all('td'))
        ]
        # Keep the row only if at least one cell is non-blank (rows without
        # any <td> cells are dropped as well).
        if not all('' == s or s.isspace() for s in row_content):
            content.append(row_content)

    if not content:
        # Nothing to use as a header row; pop(0) would raise IndexError.
        print('Skipping {}: table has no non-empty rows'.format(file_name))
        continue

    # The first retained row supplies the column names.
    headers = content.pop(0)
    # Return value is ignored, so this presumably de-duplicates the row list
    # in place -- verify against utils.remove_duplicates.
    remove_duplicates(content)
    df = pandas.DataFrame(content, columns=headers)

    # splitext strips only the final extension, so "a.b.xls" becomes
    # "a.b.csv" rather than being truncated to "a.csv".
    csv_name = os.path.splitext(file_name)[0] + '.csv'
    df.to_csv(os.path.join(os.pardir, 'csv', csv_name),
              index=False, encoding='ANSI')