-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
78 lines (65 loc) · 2.89 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 20 14:45:32 2019
@author: mikaelapisanileal
"""
import pandas as pd
from files_manager import FilesManager
import os
class Processor:
    """Merge the CSV files of a Dropbox category folder into sorted chunks.

    Files are downloaded one at a time, accumulated in a DataFrame and
    handed to ``FilesManager.save_data`` whenever ``FilesManager.check_chunks``
    reports that the accumulated data should be flushed.

    ``set_log`` must be called before ``TL_data``: the logger is only
    assigned there and ``TL_data`` logs through ``self.log``.
    """

    def __init__(self, prefix, output_size_mb, dbx):
        """Store the collaborators used by ``TL_data``.

        Args:
            prefix: Value forwarded to ``FilesManager.save_data`` for every
                saved chunk (presumably an output file-name prefix).
            output_size_mb: Value used to build the ``FilesManager``
                (presumably the chunk-size threshold in MB -- confirm
                against ``FilesManager``).
            dbx: Dropbox connector exposing ``list_files``, ``folder_exists``,
                ``create_folder`` and ``download_file``.
        """
        # Dropbox connector
        self.dbx = dbx
        self.prefix = prefix
        self.files_manager = FilesManager(output_size_mb)

    def set_log(self, log):
        """Set the logger used by this processor and by its files manager."""
        self.log = log
        self.files_manager.set_log(log)

    def _save_chunk(self, chunk, result_folder, sufix, upload_folder, index):
        """Sort ``chunk`` by ``date`` (newest first) and pass it to save_data.

        Returns whatever ``FilesManager.save_data`` returns; the caller
        uses it as the index for the next chunk.
        """
        chunk = chunk.sort_values(by=['date'], ascending=False)
        return self.files_manager.save_data(result_folder,
                                            self.prefix,
                                            sufix,
                                            upload_folder,
                                            chunk,
                                            index,
                                            self.dbx)

    def TL_data(self, data_folder_dropbox, dropbox_folder_upload, data_folder,
                result_folder, sufix, category):
        """Download, merge and re-upload every CSV file of one category.

        Args:
            data_folder_dropbox: Dropbox path prefix of the input data;
                ``category`` is appended to it directly.
            dropbox_folder_upload: Dropbox path prefix for the results;
                ``category`` is appended to it directly.
            data_folder: Local path prefix for temporary downloads; the
                file name is appended directly, so it must end with a path
                separator.
            result_folder: Forwarded unchanged to ``FilesManager.save_data``.
            sufix: Forwarded unchanged to ``FilesManager.save_data``.
            category: Name of the category sub-folder to process.

        Each input file must contain ``date`` and ``ticker`` columns plus a
        column named after the ticker value of its first row; that column
        is renamed to ``count``. Files that are empty or contain no data
        rows are skipped. Every downloaded file is deleted locally once it
        has been read.
        """
        source_folder = data_folder_dropbox + category
        dropbox_path = dropbox_folder_upload + category
        upload_folder = dropbox_path + '/'

        filenames = self.dbx.list_files(source_folder)
        final_df = pd.DataFrame()
        index = 0

        if not self.dbx.folder_exists(dropbox_path):
            self.log.info('Create folder %s', dropbox_path)
            self.dbx.create_folder(dropbox_path)

        # For each file, append until the chunk threshold is reached.
        for filename in filenames:
            path = source_folder + '/' + filename
            local_path = data_folder + filename
            self.dbx.download_file(path, local_path)
            self.log.info('Processing file:%s', local_path)

            try:
                df = pd.read_csv(local_path, dtype={'ticker': str})
            except pd.errors.EmptyDataError:
                # Completely empty file: nothing to merge.
                df = None
            finally:
                # The data is in memory (or unusable) at this point, so the
                # temporary download is removed even if parsing failed.
                os.remove(local_path)

            if df is None or df.shape[0] == 0:
                continue

            # The value column is named after the ticker; normalise it so
            # frames coming from different tickers can be stacked.
            ticker = df['ticker'].iloc[0]
            df = df.rename(index=str, columns={ticker: 'count'})

            if self.files_manager.check_chunks(final_df, df):
                # Nothing to flush while the accumulator is still empty;
                # sorting an empty frame by 'date' would raise KeyError.
                if not final_df.empty:
                    self.log.info('Uploading chunk:%d', index)
                    index = self._save_chunk(final_df, result_folder, sufix,
                                             upload_folder, index)
                final_df = df
            elif final_df.empty:
                final_df = df
            else:
                final_df = pd.concat([final_df, df])

        # Save the last chunk.
        if final_df.shape[0] > 0:
            self.log.info('Saving last chunck')
            self._save_chunk(final_df, result_folder, sufix,
                             upload_folder, index)