class Processor():
    """Downloads data files from Dropbox, merges them into size-bounded
    DataFrame chunks and uploads the resulting CSV files back to Dropbox.
    """

    def __init__(self, conf):
        self.config = conf
        # Set logging configuration; the level comes from the config file.
        logging.basicConfig(format='%(levelname)s:%(asctime)s - %(message)s')
        self.log = logging.getLogger()
        self.log.setLevel(logging.getLevelName(self.config.log_level))
        self.dbx = DropboxHandler(self.config.access_token,
                                  self.config.dropbox_timeout,
                                  self.config.dropbox_chunck)

    def upload_file(self, df, idx):
        """Write *df* to a local CSV file and upload it to Dropbox.

        If the upload fails, the file is kept in the results folder so it
        can be recovered/retried manually; on success it is removed.

        :param df: DataFrame to persist.
        :param idx: integer index used to build the output filename.
        :return: the next file index (``idx + 1``).
        """
        filename = self.config.result_prefix + str(idx) + self.config.result_extension
        file_from = self.config.result_folder + filename
        file_to = self.config.dropbox_folder_upload + filename
        df.to_csv(file_from, index=False)
        self.log.info('Uploading file: %s', file_from)
        try:
            self.dbx.upload_file(file_from, file_to)
        except Exception as err:
            # Keep the local copy for manual recovery when the upload fails.
            self.log.error('Failed to upload %s\n%s', file_from, err)
        else:
            os.remove(file_from)
        return idx + 1

    def save_data(self, df, idx):
        """Divide *df* into chunks under the configured output size and
        upload each chunk to Dropbox.

        :param df: DataFrame to split and upload.
        :param idx: starting file index.
        :return: the next free file index after all uploads.
        """
        chunks = get_chunks(self.config.output_size_mb, df)
        if chunks == 0:
            # Already under the size threshold: upload as a single file.
            idx = self.upload_file(df, idx)
        else:
            for chunk in np.array_split(df, chunks):
                idx = self.upload_file(chunk, idx)
        return idx

    def create_dataframe(self, local_path):
        """Read the file at *local_path* and return it as a DataFrame.

        Tries CSV first (delimiter sniffed via ``get_delimiter``); when
        CSV parsing fails, falls back to reading the file as Excel.
        """
        columns = ['asin', 'manufacturer', 'invalid']
        dtypes = {'asin': object, 'manufacturer': object, 'invalid': object}
        try:
            df = pd.read_csv(local_path, header=0,
                             sep=get_delimiter(local_path, self.config.encoding_input),
                             usecols=columns, dtype=dtypes,
                             encoding=self.config.encoding_input)
        except Exception as err:
            self.log.warning('Failed to process file:%s\n%s', local_path, err)
            # pandas >= 1.2 removed the `encoding` argument from read_excel,
            # so it must not be passed here (it would raise TypeError).
            df = pd.read_excel(local_path, header=0,
                               usecols=columns, dtype=dtypes)
        return df

    def process_data(self):
        """List all files to process; append each matching file to a
        DataFrame until the size threshold is reached, at which point the
        accumulated data is uploaded and a new DataFrame is started.
        """
        files = self.dbx.list_recursive(self.config.dropbox_folder_download)
        # Compile the filename pattern once, outside the loop.
        matcher = re.compile(self.config.file_regex)
        df = pd.DataFrame(data={})
        idx = 0
        for file in files:
            file_dir = file[0]
            filename = file[1]
            file_path = file_dir + '/' + filename
            if matcher.match(filename):
                try:
                    local_path = self.config.data_folder + filename
                    self.dbx.download_file(file_path, local_path)
                    df2 = self.create_dataframe(local_path)
                    if check_chunks(self.config.output_size_mb, df, df2):
                        # Threshold reached: flush the accumulator and
                        # start over with the newly read data.
                        idx = self.save_data(df, idx)
                        df = df2
                    else:
                        # DataFrame.append was removed in pandas 2.0;
                        # pd.concat is the supported equivalent.
                        df = pd.concat([df, df2])
                    os.remove(local_path)
                except Exception as err:
                    self.log.error('Failed processing file %s\n%s', filename, err)
        # Upload whatever is left in the accumulator.
        if df.shape[0] > 0:
            self.log.info('Saving last chunck')
            idx = self.save_data(df, idx)
def main(argv):
    """Command-line entry point.

    Options:
      -h                show usage (via ``info()``) and exit
      -c / --config     path to the configuration file
      -i / --import     import data from Google Trends (true/1/yes)
      -p / --process    process downloaded data and upload to Dropbox

    Exits with status 0 on success, 1 on any initialization/processing
    error, 2 on bad command-line arguments.
    """
    def _parse_flag(value):
        # bool('False') is True for any non-empty string, so the original
        # bool(arg) enabled the stage no matter what was passed; parse the
        # string explicitly instead.
        return value.strip().lower() in ('true', '1', 'yes', 'y')

    config_path = ''
    import_data = False
    process_data = False
    try:
        opts, args = getopt.getopt(argv, 'hi:c:p:',
                                   ['config=', 'import=', 'process='])
    except getopt.GetoptError:
        info()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            info()
            sys.exit()
        elif opt in ('-c', '--config'):
            config_path = arg
        elif opt in ('-i', '--import'):
            import_data = _parse_flag(arg)
        elif opt in ('-p', '--process'):
            process_data = _parse_flag(arg)

    try:
        config = Config(config_path)
        # Set logging; the level comes from the configuration file.
        logging.basicConfig(
            format='%(levelname)s:%(asctime)s - %(pathname)s:%(lineno)d: %(message)s')
        log = logging.getLogger()
        log.setLevel(logging.getLevelName(config.log_level))
    except Exception as ex:
        # Logging is not configured yet, so fall back to print.
        print('There has been an error while initializing configuration.\n%s' % (ex))
        sys.exit(1)

    try:
        dbx = DropboxHandler(config.access_token, config.dropbox_timeout,
                             config.dropbox_chunck)
        dbx.set_log(log)
    except Exception as ex:
        log.error(
            'There has been an error while initializing dropbox handler.\n%s' % (ex))
        sys.exit(1)

    try:
        # Import data from Google Trends.
        if import_data:
            # Download the tickers file if it is not already present locally.
            if not path.exists(config.tickers_folder):
                dbx.download_file(config.tickers_path, config.tickers_folder)
            # Download Google Trends data.
            gt = GTrends(config.encoding, config.tz,
                         config.gtrends_timeout_connect,
                         config.gtrends_timeout_read, config.retries,
                         config.backoff_factor, config.geo, dbx)
            gt.set_log(log)
            download_all = gt.import_data(
                config.tickers_folder, config.year_from, config.year_until,
                config.categories, config.data_folder_monthly,
                config.data_folder_daily, config.data_folder_monthly_dropbox,
                config.data_folder_daily_dropbox)
            print('download_all=%s' % str(download_all))
    except Exception as ex:
        log.error('There has been an error while importing data.\n%s' % (ex))
        sys.exit(1)

    try:
        # Process data and upload it to Dropbox.
        if process_data:
            # NOTE(review): this constructor signature differs from the
            # Processor class defined above (which takes a single config
            # object and has no set_log/TL_data) — presumably a different
            # Processor is imported here; verify against the imports.
            p = Processor(config.prefix, config.output_size_mb, dbx)
            p.set_log(log)
            for category in config.categories:
                # Each category entry is formatted as "<name>:<type>".
                parts = category.split(':')
                category_name = parts[0]
                category_type = parts[1]
                if category_type == 'monthly':
                    p.TL_data(config.data_folder_monthly_dropbox,
                              config.dropbox_folder_upload_monthly,
                              config.tmp_folder_monthly,
                              config.result_folder_monthly,
                              'monthly.csv', category_name)
                else:
                    p.TL_data(config.data_folder_daily_dropbox,
                              config.dropbox_folder_upload_daily,
                              config.tmp_folder_daily,
                              config.result_folder_daily,
                              'daily.csv', category_name)
    except Exception as ex:
        log.error('There has been an error while processing data.\n%s' % (ex))
        sys.exit(1)
    sys.exit(0)