Beispiel #1
0
class Processor():
    """Merge input files from Dropbox into result CSV files and upload them.

    Files below ``config.dropbox_folder_download`` whose name matches
    ``config.file_regex`` are downloaded one by one, read into data frames and
    accumulated.  Whenever the module-level helper ``check_chunks`` reports
    that the accumulated frame plus the next file crosses the configured
    output size, the accumulated frame is written out and uploaded.
    """

    # Columns read from every input file; all are kept as plain objects so
    # pandas performs no type inference on them.
    _COLUMNS = ['asin', 'manufacturer', 'invalid']

    def __init__(self, conf):
        """Store the configuration, set up logging and the Dropbox handler.

        Args:
            conf: configuration object; this class reads ``log_level``,
                ``access_token``, ``dropbox_timeout``, ``dropbox_chunck`` and
                the folder/prefix/encoding attributes used by the methods.
        """
        self.config = conf
        # set logging configuration
        logging.basicConfig(format='%(levelname)s:%(asctime)s - %(message)s')
        self.log = logging.getLogger()
        self.log.setLevel(logging.getLevelName(self.config.log_level))
        self.dbx = DropboxHandler(self.config.access_token,
                                  self.config.dropbox_timeout,
                                  self.config.dropbox_chunck)

    def upload_file(self, df, idx):
        """Write ``df`` to a local CSV file and upload that file to Dropbox.

        The file name is ``result_prefix + str(idx) + result_extension``.  If
        the upload fails the error is logged and the local file is kept in
        the result folder; after a successful upload it is deleted.

        Args:
            df: data frame to write (without its index).
            idx: running number used in the file name.

        Returns:
            ``idx + 1``, whether or not the upload succeeded.
        """
        filename = (self.config.result_prefix + str(idx)
                    + self.config.result_extension)
        file_from = self.config.result_folder + filename
        file_to = self.config.dropbox_folder_upload + filename
        self.log.debug('Upload target: %s', file_to)
        df.to_csv(file_from, index=False)
        self.log.info('Uploading file: %s', file_from)
        try:
            self.dbx.upload_file(file_from, file_to)
        except Exception as err:
            # Best effort: keep the local copy so the data is not lost.
            self.log.error('Failed to upload %s\n%s', file_from, err)
        else:
            os.remove(file_from)
        return idx + 1

    def save_data(self, df, idx):
        """Upload ``df``, split into several files when ``get_chunks`` asks.

        ``get_chunks`` is defined elsewhere in the module; a result of ``0``
        is treated as "upload the frame as a single file", any other value is
        passed to ``np.array_split`` as the number of pieces.

        Args:
            df: data frame to upload.
            idx: running number of the next result file.

        Returns:
            The running number to use for the next result file.
        """
        chunks = get_chunks(self.config.output_size_mb, df)
        if chunks == 0:
            return self.upload_file(df, idx)
        for chunk in np.array_split(df, chunks):
            idx = self.upload_file(chunk, idx)
        return idx

    def create_dataframe(self, local_path):
        """Read ``local_path`` into a data frame with the expected columns.

        The file is first parsed as CSV, using the delimiter reported by the
        module-level ``get_delimiter`` helper.  If that fails for any reason
        a warning is logged and the file is read as an Excel workbook
        instead; an error from that second attempt propagates to the caller.

        Args:
            local_path: path of the downloaded file.

        Returns:
            A data frame holding the ``asin``, ``manufacturer`` and
            ``invalid`` columns.
        """
        columns = self._COLUMNS
        dtypes = {name: object for name in columns}
        try:
            return pd.read_csv(
                local_path, header=0,
                sep=get_delimiter(local_path, self.config.encoding_input),
                usecols=columns,
                dtype=dtypes,
                encoding=self.config.encoding_input)
        except Exception as err:
            self.log.warning('Failed to process file:%s\n%s', local_path, err)
        # ``read_excel`` has no ``encoding`` parameter (current pandas raises
        # TypeError for it), so it is intentionally not passed here.
        return pd.read_excel(local_path, header=0,
                             usecols=columns,
                             dtype=dtypes)

    def _load_remote_file(self, file_path, filename):
        """Download one Dropbox file, parse it and delete the local copy."""
        local_path = self.config.data_folder + filename
        try:
            self.dbx.download_file(file_path, local_path)
            return self.create_dataframe(local_path)
        finally:
            # Remove the temporary download even when parsing failed.
            if os.path.exists(local_path):
                os.remove(local_path)

    def process_data(self):
        """Accumulate all matching input files and upload them in batches.

        Every listed entry is expected to be a ``(directory, filename)`` pair.
        A failure while handling one file is logged and the file is skipped;
        the remaining files are still processed.  Rows left over after the
        last file are uploaded as a final batch.
        """
        files = self.dbx.list_recursive(self.config.dropbox_folder_download)
        # Compile once; the pattern does not change between files.
        matcher = re.compile(self.config.file_regex)
        df = pd.DataFrame(data={})
        idx = 0
        for file in files:
            file_dir = file[0]
            filename = file[1]
            if not matcher.match(filename):
                continue
            file_path = file_dir + '/' + filename
            try:
                df2 = self._load_remote_file(file_path, filename)
                if df.shape[0] == 0:
                    # Nothing accumulated yet: start the batch with this file
                    # instead of uploading an empty result file.
                    df = df2
                elif check_chunks(self.config.output_size_mb, df, df2):
                    idx = self.save_data(df, idx)
                    df = df2
                else:
                    # DataFrame.append was removed in pandas 2.0.
                    df = pd.concat([df, df2])
            except Exception as err:
                self.log.error('Failed processing file %s\n%s', filename, err)

        if df.shape[0] > 0:
            self.log.info('Saving last chunck')
            idx = self.save_data(df, idx)
            
             
Beispiel #2
0
def _flag_enabled(value):
    """Interpret a command-line option value as an on/off switch.

    Only an empty value or an explicit negative ("0", "false", "no", "n",
    "off", "f", case-insensitive) disables the switch; every other value
    enables it, as it did when the value was converted with ``bool()``.
    """
    return value.strip().lower() not in ('', '0', 'false', 'no', 'n', 'off',
                                         'f')


def main(argv):
    """Command-line entry point: optionally import data, then process it.

    Recognised options:
        -h                      print usage (via ``info()``) and exit
        -c, --config PATH       path of the configuration file
        -i, --import VALUE      run the import step unless VALUE is negative
        -p, --process VALUE     run the processing step unless VALUE is
                                negative

    The function always terminates the interpreter through ``sys.exit``:
    status 2 for invalid options, 1 when configuration, Dropbox set-up,
    import or processing fails, and 0 otherwise.

    Args:
        argv: list of command-line arguments without the program name.
    """
    config_path = ''
    import_data = False
    process_data = False
    try:
        opts, _ = getopt.getopt(argv, 'hi:c:p:',
                                ['config=', 'import=', 'process='])
    except getopt.GetoptError:
        info()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            info()
            sys.exit()
        elif opt in ('-c', '--config'):
            config_path = arg
        elif opt in ('-i', '--import'):
            # bool(arg) would be True for any non-empty text, "false" included.
            import_data = _flag_enabled(arg)
        elif opt in ('-p', '--process'):
            process_data = _flag_enabled(arg)

    try:
        config = Config(config_path)

        # set logging
        logging.basicConfig(
            format=
            '%(levelname)s:%(asctime)s - %(pathname)s:%(lineno)d: %(message)s')
        log = logging.getLogger()
        log.setLevel(logging.getLevelName(config.log_level))
    except Exception as ex:
        # The logger may not exist yet, so report on stdout.
        print('There has been an error while initializing configuration.\n%s' %
              (ex))
        sys.exit(1)

    try:
        dbx = DropboxHandler(config.access_token, config.dropbox_timeout,
                             config.dropbox_chunck)
        dbx.set_log(log)
    except Exception as ex:
        log.error(
            'There has been an error while initializing dropbox handler.\n%s',
            ex)
        sys.exit(1)

    try:
        # import data from Google trends
        if import_data:
            # download tickers file unless it is already present locally
            if not path.exists(config.tickers_folder):
                dbx.download_file(config.tickers_path, config.tickers_folder)

            # download gtrends data
            gt = GTrends(config.encoding, config.tz,
                         config.gtrends_timeout_connect,
                         config.gtrends_timeout_read, config.retries,
                         config.backoff_factor, config.geo, dbx)
            gt.set_log(log)
            download_all = gt.import_data(
                config.tickers_folder, config.year_from, config.year_until,
                config.categories, config.data_folder_monthly,
                config.data_folder_daily, config.data_folder_monthly_dropbox,
                config.data_folder_daily_dropbox)
            print('download_all=%s' % str(download_all))
    except Exception as ex:
        log.error('There has been an error while importing data.\n%s', ex)
        sys.exit(1)

    try:
        # process data and upload to dropbox
        if process_data:
            p = Processor(config.prefix, config.output_size_mb, dbx)
            p.set_log(log)
            for category in config.categories:
                # Each entry has the form "<name>:<type>"; a missing ":"
                # raises here and is reported by the handler below.
                parts = category.split(':')
                category_name = parts[0]
                category_type = parts[1]
                if category_type == 'monthly':
                    p.TL_data(config.data_folder_monthly_dropbox,
                              config.dropbox_folder_upload_monthly,
                              config.tmp_folder_monthly,
                              config.result_folder_monthly, 'monthly.csv',
                              category_name)
                else:
                    # Every type other than "monthly" is handled as daily.
                    p.TL_data(config.data_folder_daily_dropbox,
                              config.dropbox_folder_upload_daily,
                              config.tmp_folder_daily,
                              config.result_folder_daily, 'daily.csv',
                              category_name)

    except Exception as ex:
        log.error('There has been an error while processing data.\n%s', ex)
        sys.exit(1)

    sys.exit(0)