def _setup(self, query, host='http://localhost', user='******', password=None, port=8123):
    """Run *query* against a ClickHouse HTTP endpoint and load the result.

    :param query: SQL query; must NOT contain its own FORMAT clause — one is
        appended here so the server replies with JSON
    :param host: server URL including scheme
    :param user: database user
    :param password: database password; omitted from the request when None
    :param port: HTTP interface port
    :return: tuple of (pandas DataFrame with the result rows, identity map of
        column names)
    :raises Exception: if the query already has a FORMAT clause, or the server
        returns an error / non-JSON response
    """
    # Reject queries that already carry a FORMAT clause — we append our own below.
    if ' format ' in query.lower():
        err_msg = 'Please refrain from adding a "FORMAT" statement to the query'
        log.error(err_msg)
        raise Exception(err_msg)

    query = f'{query} FORMAT JSON'
    log.info(f'Getting data via the query: "{query}"')

    params = {'user': user}
    if password is not None:
        # only send a password parameter when one was actually given
        params['password'] = password

    response = requests.post(f'{host}:{port}', data=query, params=params)
    try:
        data = response.json()['data']
    except Exception:
        # server replied with an error or non-JSON body; surface it verbatim
        log.error(f'Got an invalid response from the database: {response.text}')
        raise Exception(response.text)

    df = pd.DataFrame(data)
    # identity mapping, kept for interface parity with other datasources
    col_map = {col: col for col in df.columns}
    return df, col_map
def _setup(self, file, clean_header=True, clean_rows=True, custom_parser=None):
    """
    Setup from file.

    :param file: filepath or url
    :param clean_header: if you want to clean header column names
    :param clean_rows: if you want to clean rows for strange null values
    :param custom_parser: if you want to parse the file with some custom parser
    :raises ValueError: when the file format cannot be detected and no
        custom parser was supplied
    """
    # get file data io, format and dialect
    data, format, dialect = self._getDataIo(file)
    data.seek(0)  # make sure we are at 0 in file pointer

    if format is None:
        err_msg = 'Could not load file into any format, supported formats are csv, json, xls, xlsx'
        log.error(err_msg)
        if custom_parser is None:
            # without a custom parser there is no way to interpret the data;
            # previously execution fell through to a NameError here
            raise ValueError(err_msg)

    if custom_parser:
        header, file_data = custom_parser(data, format)

    elif format == 'csv':
        csv_reader = list(csv.reader(data, dialect))
        header = csv_reader[0]
        file_data = csv_reader[1:]

    elif format in ['xlsx', 'xls']:
        data.seek(0)
        df = pandas.read_excel(data)
        header = df.columns.values.tolist()
        file_data = df.values.tolist()

    elif format == 'json':
        data.seek(0)
        json_doc = json.loads(data.read())
        df = json_normalize(json_doc)
        header = df.columns.values.tolist()
        file_data = df.values.tolist()

    if clean_header:
        header = self.clean(header)

    if clean_rows:
        file_list_data = [self.cleanRow(row) for row in file_data]
    else:
        file_list_data = file_data

    self.setDF(pandas.DataFrame(file_list_data, columns=header))
def getDS(from_data):
    '''
    Get a datasource for the given input.

    :param from_data: a DataSource (returned as-is), a DataFrame (wrapped in a
        DataSource), or anything else — assumed to be a file path / url and
        handed to FileDS
    :return: a DataSource
    '''
    if isinstance(from_data, DataSource):
        # already a datasource — pass it through untouched
        return from_data
    if isinstance(from_data, DataFrame):
        return DataSource(from_data)
    # assume it is a file path or url
    # NOTE: the previous `from_ds is None` check was unreachable — every
    # branch assigns a constructed object — so it has been removed
    return FileDS(from_data)
def _getDataIo(self, file): """ This gets a file either url or local file and defiens what the format is as well as dialect :param file: file path or url :return: data_io, format, dialect """ ############ # get file as io object ############ data = BytesIO() # get data from either url or file load in memory if file[:5] == 'http:' or file[:6] == 'https:': r = requests.get(file, stream=True) if r.status_code == 200: for chunk in r: data.write(chunk) data.seek(0) # else read file from local file system else: try: data = open(file, 'rb') except Exception as e: error = 'Could not load file, possible exception : {exception}'.format( exception=e) log.error(error) raise ValueError(error) dialect = None ############ # check for file type ############ # try to guess if its an excel file xlsx_sig = b'\x50\x4B\x05\06' xlsx_sig2 = b'\x50\x4B\x03\x04' xls_sig = b'\x09\x08\x10\x00\x00\x06\x05\x00' # differnt whence, offset, size for different types excel_meta = [('xls', 0, 512, 8), ('xlsx', 2, -22, 4)] for filename, whence, offset, size in excel_meta: try: data.seek(offset, whence) # Seek to the offset. bytes = data.read( size) # Capture the specified number of bytes. 
data.seek(0) codecs.getencoder('hex')(bytes) if bytes == xls_sig: return data, 'xls', dialect elif bytes == xlsx_sig: return data, 'xlsx', dialect except: data.seek(0) # if not excel it can be a json file or a CSV, convert from binary to stringio byte_str = data.read() # Move it to StringIO try: # Handle Microsoft's BOM "special" UTF-8 encoding if byte_str.startswith(codecs.BOM_UTF8): data = StringIO(byte_str.decode('utf-8-sig')) else: data = StringIO(byte_str.decode('utf-8')) except: log.error(traceback.format_exc()) log.error('Could not load into string') # see if its JSON buffer = data.read(100) data.seek(0) text = buffer.strip() # analyze first n characters if len(text) > 0: text = text.strip() # it it looks like a json, then try to parse it if text != "" and ((text[0] == "{") or (text[0] == "[")): try: json.loads(data.read()) data.seek(0) return data, 'json', dialect except: data.seek(0) return data, None, dialect # lets try to figure out if its a csv try: data.seek(0) first_few_lines = [] i = 0 for line in data: i += 1 first_few_lines.append(line) if i > 0: break accepted_delimiters = [',', '\t'] dialect = csv.Sniffer().sniff(''.join(first_few_lines[0]), delimiters=accepted_delimiters) data.seek(0) # if csv dialect identified then return csv if dialect: return data, 'csv', dialect else: return data, None, dialect except: data.seek(0) log.error('Could not detect format for this file') log.error(traceback.format_exc()) # No file type identified return data, None, dialect