def move_tables_to_memory(data, source):
    """Create a petl.MemorySource and copy data from the given tables into it.

    Can greatly decrease processing time for some operations."""
    for s in source:
        source_table = data.get(s)
        a = petl.MemorySource()
        source_table.tocsv(a)
        b = petl.MemorySource(a.getvalue())
        destination_table = petl.fromcsv(source=b)
        data.set(s, destination_table)
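# --- Illustrative aside (not part of the original source): the heart of
# move_tables_to_memory() is a petl.MemorySource round-trip -- serialise a
# table to an in-memory CSV buffer, then re-open those bytes so downstream
# transforms read from memory instead of the original source. A minimal
# sketch, assuming petl is importable; table names and rows are made up.
import petl

original = petl.wrap([['id', 'name'], [1, 'a'], [2, 'b']])

buffer = petl.MemorySource()        # collects whatever tocsv writes
petl.tocsv(original, buffer)        # serialise the table to CSV bytes in memory
in_memory = petl.fromcsv(petl.MemorySource(buffer.getvalue()))  # re-open from bytes

print(petl.lookall(in_memory))      # rows now come from memory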
def download(self, path=''):
    logging.info('Start download from google drive')
    logging.info(path)
    report_filename = self.get_report_filename(self.report, self.from_date,
                                               self.to_date)
    if cache:
        cache_key = path
        cache_timeout = CONNECTOR_INFO.get('report_cache_timeout', 60 * 60)
        z_report = cache.get(cache_key)
        if z_report is not None:
            return petl.io.fromjson(
                petl.MemorySource(zlib.decompress(z_report)))

        logging.info('Download Report from {}'.format(path))
        storage = self.storage[0]
        storage.init()
        fname = '{}.json'.format(self.report)
        with storage.open(path) as archive_file:
            with zipfile.ZipFile(archive_file) as zip_file:
                # logging.info(fname)
                report = zip_file.read(fname)
                z_report = zlib.compress(report)
                cache.set(cache_key, z_report, timeout=cache_timeout)
                return petl.io.fromjson(petl.MemorySource(report))
    else:
        # move to init
        if not os.path.exists(self.report_folder):
            os.makedirs(self.report_folder)

        if not os.path.exists(report_filename):
            logging.info('Download Report from {}'.format(path))
            storage = self.storage[0]
            storage.init()
            fname = '{}.json'.format(self.report)
            with storage.open(path) as archive_file:
                with zipfile.ZipFile(archive_file) as zip_file:
                    # logging.info(fname)
                    report = zip_file.read(fname)
            with open(report_filename, 'wb') as f:
                f.write(report)

        logging.info('Read from {}'.format(report_filename))
        report = petl.io.fromjson(report_filename)
        return report
    return []
def _persist_data(self, data: etl.Table) -> None:
    sink = etl.MemorySource()
    data.tocsv(sink)
    collection = Collection.objects.create()
    collection.csv_file.save(f"{str(uuid.uuid4())}.csv",
                             ContentFile(sink.getvalue()))
def display(self, collection: Collection, limit: int) -> etl.Table:
    if not collection.csv_file.name:
        return etl.Table()
    with collection.csv_file.open() as f:
        # take lines 1..limit of the stored CSV (the file's first line is skipped)
        source = etl.MemorySource(b"".join(islice(f, 1, limit + 1)))
        table = etl.fromcsv(source)
    return table
def count_values(self, collection: Collection, columns: List[str],
                 save_as: str) -> etl.Table:
    if not collection.csv_file.name:
        return etl.Table()
    with collection.csv_file.open() as f:
        source = etl.MemorySource(f.read())
        table = etl.fromcsv(source)
    return table.aggregate(columns, {save_as: len})
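# --- Illustrative aside (not part of the original source): count_values()
# relies on petl's aggregate(), which takes a key (or tuple of keys) and a
# mapping from output field name to aggregation function; len counts the rows
# in each group. A small sketch with made-up data, no Collection model needed.
import petl as etl

example = etl.wrap([
    ['city', 'product'],
    ['Oslo', 'apples'],
    ['Oslo', 'pears'],
    ['Bergen', 'apples'],
])

# one output row per city, with the group size stored under 'total'
counts = example.aggregate('city', {'total': len})
print(etl.lookall(counts))  # ('Bergen', 1), ('Oslo', 2) -- aggregate sorts by key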
def fromcsvwithheader(source, **kwargs):
    """
    Call `petl.fromcsv` and automatically parse the metadata header if present.
    """
    # preliminary open for inspection
    rawtable = petl.fromcsv(source, header=None, **kwargs)
    metadata, header, data = parse_metadata_header(rawtable)

    # transfer data to an in-memory data buffer
    databuffer = petl.MemorySource()
    petl.tocsv(data, databuffer, write_header=True, encoding='utf-8')
    databuffer.s = databuffer.getvalue()

    # re-open with the right header and attach metadata
    table = petl.fromcsv(databuffer, header=header)
    table.metadata = metadata
    return table
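# --- Illustrative aside (not part of the original source): the trick in
# fromcsvwithheader() is reusing one MemorySource for both writing and
# reading -- tocsv fills the internal buffer, and assigning getvalue() back to
# the .s attribute makes those bytes the readable payload for a later fromcsv.
# A stripped-down sketch of just that round-trip (parse_metadata_header is
# project-specific and omitted here).
import petl

table = petl.wrap([['a', 'b'], [1, 2], [3, 4]])

buf = petl.MemorySource()
petl.tocsv(table, buf, write_header=True, encoding='utf-8')  # fill the write buffer
buf.s = buf.getvalue()        # expose the written bytes for reading

reopened = petl.fromcsv(buf)  # reads from the same MemorySource instance
print(petl.lookall(reopened))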
def read_raw(self, file):
    # return etl.fromcsv(file.name)
    in_memory_file = StringIO()
    csv_writer = csv.writer(in_memory_file)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        wb = openpyxl.load_workbook(file.name)
        sh = wb.worksheets[0]
        # dump the first worksheet into an in-memory CSV
        for r in sh.rows:
            csv_writer.writerow([cell.value for cell in r])
    in_memory_file.seek(0)
    rawlines = in_memory_file.read().encode()
    # wrap the CSV bytes so petl can read them as if from a file on disk
    raw = etl.MemorySource(rawlines)
    rdr = etl.fromcsv(raw)
    return rdr
def etl_data_from_teragon(url, data, tranpose, indexed):
    """Make a request to the Teragon service and transform the response.

    Arguments:
        url {str} -- Teragon API endpoint
        data {dict} -- request payload (always sent as data via POST)
        tranpose {bool} -- transpose the resulting table (default: False)

    Returns:
        {dict} -- Teragon API response transformed into a nested dictionary,
            ready to be transmitted as JSON
    """
    # get the data
    start_time = timeit.default_timer()
    response = requests.post(url, data=data)
    elapsed = timeit.default_timer() - start_time
    print("response received in {0} seconds".format(elapsed))

    # post-process and return the response
    start_time = timeit.default_timer()
    table = etl.MemorySource(response.text.encode())
    result = transform_teragon_csv(table, tranpose, indexed)
    elapsed = timeit.default_timer() - start_time
    print("data processed in {0} seconds".format(elapsed))

    return result
def _convert_to_html(table):
    table_output = petl.MemorySource()
    table.tohtml(table_output)
    return table_output.getvalue().decode('utf-8')
def to_csv(data: Table) -> bytes:
    output = etl.MemorySource()
    etl.tocsv(data, output)
    return output.getvalue()
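# --- Illustrative aside (not part of the original source): both helpers above
# render a table into a MemorySource and pull the bytes out with getvalue().
# One plausible use is serving those bytes over HTTP; the Django view below is
# an assumption for illustration, not code from the original project.
from django.http import HttpResponse
import petl as etl

def download_csv(request, data):
    # `data` is assumed to be a petl Table built elsewhere
    output = etl.MemorySource()
    etl.tocsv(data, output)
    resp = HttpResponse(output.getvalue(), content_type='text/csv')
    resp['Content-Disposition'] = 'attachment; filename="export.csv"'
    return resp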
def time_report(request):
    """
    The main view for the Time Report.

    Using the auth token, retrieves the necessary data and displays it to the
    user as a table. Data can be downloaded to a CSV file.
    """
    data = None
    if request.method == 'POST':
        form = TimeReportForm(request.POST)
        if form.is_valid():
            form_data = form.cleaned_data
            client = HubstaffApiClient(form_data["app_token"], form_data["auth_token"])
            try:
                users_t = (petl.fromdicts(client.list_users())
                           .cut("id", "name").rename("name", "user"))
                projects_t = (petl.fromdicts(client.list_projects())
                              .cut("id", "name").rename("name", "project"))
                main_t = petl.fromdicts(client.list_activities_for_date(form_data["for_date"]))
            except HTTPError as e:
                form.add_error(None, "The Hubstaff API responded with an error: {} {}".format(
                    e.response.status_code, e.response.reason)
                )
            if main_t.nrows() > 0:
                # Now some PETL processing: remove unnecessary fields, join with the
                # 'users' and 'projects' tables, aggregate on time spent and pivot,
                # then apply formatting to the time values
                data = (main_t
                        .cut("user_id", "project_id", "tracked")
                        .join(users_t, lkey="user_id", rkey="id")
                        .join(projects_t, lkey="project_id", rkey="id")
                        .aggregate(("user", "project"), sum, "tracked")
                        .convert("value",
                                 convert_seconds_to_iso if form_data["format"] == "csv"
                                 else convert_seconds_to_human
                                 )
                        .pivot("project", "user", "value", lambda a: a[0])
                        )
                if request.POST.get("action") == "Download":
                    filename = "activities_report_{}.{}".format(
                        form_data["for_date"].strftime("%Y%m%d"), form_data["format"]
                    )
                    resp = HttpResponse(content_type='text/{}'.format(
                        form_data["format"] if form_data["format"] != "txt" else "plain"
                    ))
                    resp['Content-Disposition'] = 'attachment; filename="{}"'.format(filename)
                    if form_data["format"] == "html":
                        buf = petl.MemorySource()
                        resp.write("<!DOCTYPE html><html><body>")
                        resp.write("<style type='text/css'>table,th,td {border: solid}</style>")
                        data.tohtml(buf, encoding="utf-8")
                        resp.write(buf.getvalue().decode())
                        resp.write("</body></html>")
                    elif form_data["format"] == "csv":
                        buf = petl.MemorySource()
                        data.tocsv(buf, encoding="utf-8")
                        resp.write(buf.getvalue().decode())
                    elif form_data["format"] == "txt":
                        resp.write(data.lookall())
                    return resp
                else:
                    buf = petl.MemorySource()
                    data.tohtml(buf, encoding="utf-8")
                    data = buf.getvalue().decode()
    else:
        form = TimeReportForm(request.GET)
        if not form.data.get("app_token") or not form.data.get("auth_token") or not form.data.get(
                "name") or not form.data.get("id") or not form.data.get("for_date"):
            return HttpResponseRedirect(urls.reverse('index'))

    context = {"form": form, "table": data, "submitted": request.method == 'POST'}
    return render(request, 'time_report.html', context)
"pixels": pixels } # print(payload) url = 'http://web.3riverswetweather.org/trp:API.pixel' # request data from Teragon API start_time = timeit.default_timer() response = requests.get(url, data=payload) elapsed = timeit.default_timer() - start_time print("response received in {0} seconds".format(elapsed)) # read and transform Teragon response start_time = timeit.default_timer() # convert text of response to bytes and read through memory source = etl.MemorySource(response.text.encode()) # read from memory as if it were a csv file on disk # table1 = etl.fromcsv(source) # transform table into dict, ready for conversion to structured JSON table1_json = transform_teragon_csv(source) # print(json.dumps(table1_json, indent=2)) print("processing table took {0} seconds".format( timeit.default_timer() - start_time)) # Transpose (can take a really long time with PETL!) # start_time = timeit.default_timer() # table2 = etl.transpose(t2_rows) # print("{0} rows after transpose eval".format(etl.nrows(table2))) # print("transposing took {0} seconds".format( # timeit.default_timer() - start_time)) # # print(etl.lookall(table2, style='minimal'))
def download(self, path=''):
    logging.info('Start download from google drive')
    logging.info(path)
    report_filename = self.get_report_filename(self.report, self.from_date,
                                               self.to_date)
    time_range = {'since': self.from_date, 'until': self.to_date}
    if cache:
        cache_key = path
        cache_timeout = CONNECTOR_INFO.get('report_cache_timeout', 60 * 60)
        z_report = cache.get(cache_key)
        if z_report is not None:
            return petl.io.fromjson(
                petl.MemorySource(zlib.decompress(z_report)))

        logging.info('Download Report from {}'.format(path))
        request = fbAdAccount.get_insights(
            # pending=True,
            async=True,  # note: 'async' is a reserved keyword from Python 3.7 on
            fields=fb_ads_insight_fields,
            params={
                'time_increment': 1,
                # 'limit': 1,
                'level': fbAdsInsights.Level.ad,
                # 'breakdowns': report.,
                'time_range': time_range
            })
        # The rest of the storage/zip/cache handling (mirroring the Google
        # Drive variant of this method) is commented out in the original
        # source.
    return []
for p in pop_colors:
    h = '#%02x%02x%02x' % tuple(int(255 * c) for c in pop_colors[p])

# chromatin
_data_chromatin = b"""CHX chro X 20009764 24393108
CH2R chro 2R 58984778 61545105
CH2L chro 2L 1 2431617
PEU2L chro 2L 2487770 5042389
IH2L chro 2L 5078962 5788875
IH3R chro 3R 38988757 41860198
CH3R chro 3R 52161877 53200684
CH3L chro 3L 1 1815119
PEU3L chro 3L 1896830 4235209
IH3L chro 3L 4264713 5031692
"""
tbl_chromatin = (
    etl
    .fromtext(etl.MemorySource(_data_chromatin))
    .split('lines', r'\s+', ['name', 'type', 'chrom', 'start', 'stop'])
    .convert(('start', 'stop'), int)
    .cutout('type')
)

# genome regions
region_X_speciation = 'X-speciation', 'X', 15000000, 24000000
region_X_free = 'X-free', 'X', 1, 14000000
region_3L_free = '3L-free', '3L', 15000000, 41000000
region_3R_free = '3R-free', '3R', 1, 37000000


# noinspection PyGlobalUndefined
def init(release_dir, load_geneset=False):
    """Initialise data resources.

    Parameters
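# --- Illustrative aside (not part of the original source): the chromatin block
# shows MemorySource treating an inline bytes literal as a whitespace-delimited
# text file. The same fromtext -> split -> convert pipeline on a tiny made-up
# literal:
import petl as etl

_data = b"""R1 chro X 100 200
R2 chro 2L 300 400
"""

tbl = (
    etl
    .fromtext(etl.MemorySource(_data))   # one 'lines' field per input row
    .split('lines', r'\s+', ['name', 'type', 'chrom', 'start', 'stop'])
    .convert(('start', 'stop'), int)
    .cutout('type')
)
print(etl.lookall(tbl))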
def dof9_import_dates(
        data, input_file, teacher_absences_destination,
        period_replacements_destination,
        emergency_teacher_availables_destination,
        emergency_teacher_yard_duty_available_destination,
        yard_duty_replacements_destination, room_replacements_destination,
        room_edits_destination, lesson_cancellations_destination,
        yard_duty_cancellations_destination, load_adjustments_destination,
        room_availables_destination, attendence_changes_destination):
    """Import the dates and definitions for the replacements that need to happen."""
    namespaces = {'x': 'http://www.timetabling.com.au/DOV9'}

    all_teacher_absences = petl.wrap([[]])
    all_period_replacements = petl.wrap([[]])
    all_emergency_teacher_availables = petl.wrap([[]])
    all_emergency_teacher_yard_duty_available = petl.wrap([[]])
    all_yard_duty_replacements = petl.wrap([[]])
    all_room_replacements = petl.wrap([[]])
    all_room_edits = petl.wrap([[]])
    all_lesson_cancellations = petl.wrap([[]])
    all_yard_duty_cancellations = petl.wrap([[]])
    all_load_adjustments = petl.wrap([[]])
    all_room_availables = petl.wrap([[]])
    all_attendence_changes = petl.wrap([[]])

    tree = lxml.etree.parse(open(input_file))

    # Read everything
    date_subtrees = tree.xpath(
        '/x:DailyOrganiserData/x:Dates/x:Date',
        namespaces={'x': 'http://www.timetabling.com.au/DOV9'})
    for date_subtree in date_subtrees:
        cf_date = date_subtree.findtext('x:Date', namespaces=namespaces)
        cf_datestring = date_subtree.findtext('x:DateString',
                                              namespaces=namespaces)
        cf_day = date_subtree.findtext('x:Day', namespaces=namespaces)
        add_common_fields = lambda i, cf_date=cf_date, cf_datestring=cf_datestring, cf_day=cf_day: i.addfield(
            'Date', cf_date).addfield('DateString', cf_datestring).addfield(
                'Day', cf_day)  # noqa

        subtree_data = petl.MemorySource(lxml.etree.tostring(date_subtree))

        if teacher_absences_destination is not None:
            teacher_absences = petl.fromxml(
                subtree_data,
                '{http://www.timetabling.com.au/DOV9}TeacherAbsences/{http://www.timetabling.com.au/DOV9}TeacherAbsence',
                {
                    'TeacherAbsenceID': '{http://www.timetabling.com.au/DOV9}TeacherAbsenceID',
                    'TeacherCode': '{http://www.timetabling.com.au/DOV9}TeacherCode',
                    'SessionNo': '{http://www.timetabling.com.au/DOV9}SessionNo',
                    'Precedes': '{http://www.timetabling.com.au/DOV9}Precedes',
                    'IsYardDuty': '{http://www.timetabling.com.au/DOV9}IsYardDuty',
                    'PeriodCode': '{http://www.timetabling.com.au/DOV9}PeriodCode',
                    'TeacherAbsenceReasonID': '{http://www.timetabling.com.au/DOV9}TeacherAbsenceID',
                    'Counted': '{http://www.timetabling.com.au/DOV9}Counted',
                    'Load': '{http://www.timetabling.com.au/DOV9}Load',
                    'ArchiveTimetableReference': '{http://www.timetabling.com.au/DOV9}ArchiveTimetableReference',
                    'ArchiveErrorType': '{http://www.timetabling.com.au/DOV9}ArchiveErrorType'
                })
            teacher_absences = add_common_fields(teacher_absences)
            all_teacher_absences = all_teacher_absences.cat(teacher_absences)

        if period_replacements_destination is not None:
            period_replacements = petl.fromxml(
                subtree_data,
                '{http://www.timetabling.com.au/DOV9}PeriodReplacements/{http://www.timetabling.com.au/DOV9}PeriodReplacement',
                {
                    'PeriodReplacementID': '{http://www.timetabling.com.au/DOV9}PeriodReplacementID',
                    'RollClassCode': '{http://www.timetabling.com.au/DOV9}RollClassCode',
                    'ClassCode': '{http://www.timetabling.com.au/DOV9}ClassCode',
                    'ClassGroupRowID': '{http://www.timetabling.com.au/DOV9}ClassGroupRowID',
                    'PeriodCode': '{http://www.timetabling.com.au/DOV9}PeriodCode',
                    'PeriodNo': '{http://www.timetabling.com.au/DOV9}PeriodNo',
                    'ReplacementTeacherCode': '{http://www.timetabling.com.au/DOV9}ReplacementTeacherCode',
                    'Load': '{http://www.timetabling.com.au/DOV9}Load',
                    'Count': '{http://www.timetabling.com.au/DOV9}Count',
                    'InLieu': '{http://www.timetabling.com.au/DOV9}InLieu',
                    'Notes': '{http://www.timetabling.com.au/DOV9}Notes',
                    'Index': '{http://www.timetabling.com.au/DOV9}Index',
                    'NotRequired': '{http://www.timetabling.com.au/DOV9}NotRequired',
                    'DuplicateReplacementID': '{http://www.timetabling.com.au/DOV9}DuplicateReplacementID',
                    'ReferenceTeacherCode': '{http://www.timetabling.com.au/DOV9}ReferenceTeacherCode',
                    'IsComposites': '{http://www.timetabling.com.au/DOV9}IsComposites',
                    'ArchiveTimetableReference': '{http://www.timetabling.com.au/DOV9}ArchiveTimetableReference',
                    'ArchiveErrorType': '{http://www.timetabling.com.au/DOV9}ArchiveErrorType'
                })
            period_replacements = add_common_fields(period_replacements)
            all_period_replacements = all_period_replacements.cat(
                period_replacements)

        if yard_duty_replacements_destination is not None:
            yard_duty_replacements = petl.fromxml(
                subtree_data,
                '{http://www.timetabling.com.au/DOV9}YardDutyReplacements/{http://www.timetabling.com.au/DOV9}YardDutyReplacement',
                {
                    'YardDutyReplacementID': '{http://www.timetabling.com.au/DOV9}YardDutyReplacementID',
                    'YardDutyCode': '{http://www.timetabling.com.au/DOV9}YardDutyCode',
                    'PeriodCode': '{http://www.timetabling.com.au/DOV9}PeriodCode',
                    'PeriodNo': '{http://www.timetabling.com.au/DOV9}PeriodNo',
                    'Precedes': '{http://www.timetabling.com.au/DOV9}Precedes',
                    'SessionNo': '{http://www.timetabling.com.au/DOV9}SessionNo',
                    'ReplacementTeacherCode': '{http://www.timetabling.com.au/DOV9}ReplacementTeacherCode',
                    'Load': '{http://www.timetabling.com.au/DOV9}Load',
                    'Count': '{http://www.timetabling.com.au/DOV9}Count',
                    'InLieu': '{http://www.timetabling.com.au/DOV9}InLieu',
                    'Notes': '{http://www.timetabling.com.au/DOV9}Notes',
                    'Index': '{http://www.timetabling.com.au/DOV9}Index',
                    'NotRequired': '{http://www.timetabling.com.au/DOV9}NotRequired',
                    'ActivityCode': '{http://www.timetabling.com.au/DOV9}ActivityCode',
                    'ReferenceTeacherCode': '{http://www.timetabling.com.au/DOV9}ReferenceTeacherCode',
                    'DuplicateReplacementID': '{http://www.timetabling.com.au/DOV9}DuplicateReplacementID',
                    'ArchiveTimetableReference': '{http://www.timetabling.com.au/DOV9}ArchiveTimetableReference',
                    'ArchiveErrorType': '{http://www.timetabling.com.au/DOV9}ArchiveErrorType'
                })
            yard_duty_replacements = add_common_fields(yard_duty_replacements)
            all_yard_duty_replacements = all_yard_duty_replacements.cat(
                yard_duty_replacements)

        if room_edits_destination is not None:
            room_edits = petl.fromxml(
                subtree_data,
                '{http://www.timetabling.com.au/DOV9}RoomEdits/{http://www.timetabling.com.au/DOV9}RoomEdit',
                {
                    'ClassCode': '{http://www.timetabling.com.au/DOV9}ClassCode',
                    'ClassGroupRowID': '{http://www.timetabling.com.au/DOV9}ClassGroupRowID',
                    'RollClassCode': '{http://www.timetabling.com.au/DOV9}RollClassCode',
                    'PeriodCode': '{http://www.timetabling.com.au/DOV9}PeriodCode',
                    'ReplacementRoomCode': '{http://www.timetabling.com.au/DOV9}ReplacementRoomCode',
                    'ArchiveTimetableReference': '{http://www.timetabling.com.au/DOV9}ArchiveTimetableReference',
                    'ArchiveErrorType': '{http://www.timetabling.com.au/DOV9}ArchiveErrorType'
                })
            room_edits = add_common_fields(room_edits)
            all_room_edits = all_room_edits.cat(room_edits)

    if teacher_absences_destination is not None:
        data.set(teacher_absences_destination, all_teacher_absences)
    if period_replacements_destination is not None:
        data.set(period_replacements_destination, all_period_replacements)
    if yard_duty_replacements_destination is not None:
        data.set(yard_duty_replacements_destination,
                 all_yard_duty_replacements)
    if room_edits_destination is not None:
        data.set(room_edits_destination, all_room_edits)
def download(self, urls=[]):
    # timeout setting for requests
    # timeout = urllib3.Timeout(connect=2.0, read=7.0)
    # http = urllib3.PoolManager(timeout=timeout)
    http = urllib3.PoolManager()
    report_data = []
    for url in urls:
        report_filename = self.get_report_filename(
            hashlib.md5(url).hexdigest())
        if cache:
            cache_key = url
            cache_timeout = CONNECTOR_INFO.get('report_cache_timeout',
                                               60 * 60)
            z_report = cache.get(cache_key)
            if z_report is not None:
                new_report_data = petl.io.fromcsv(
                    petl.MemorySource(zlib.decompress(z_report)))
                if not report_data:
                    report_data = new_report_data
                else:
                    report_data = petl.cat(report_data, new_report_data)
                continue

            logging.info('Download Report from {}'.format(url))
            r = http.request('GET', url,
                             retries=urllib3.Retry(
                                 redirect=2,
                                 backoff_factor=2,
                             ))
            if r.status == 200:
                report = r.data
                r.release_conn()
                z_report = zlib.compress(report)
                cache.set(cache_key, z_report, timeout=cache_timeout)
                new_report_data = petl.io.fromcsv(
                    petl.MemorySource(report))
                if not report_data:
                    report_data = new_report_data
                else:
                    report_data = petl.cat(report_data, new_report_data)
            elif r.status == 403:
                raise Exception(r.data)
            else:
                logging.info(r.data)
                logging.info(r.status)
                logging.info(r.headers)
        else:
            # move to init
            if not os.path.exists(self.report_folder):
                os.makedirs(self.report_folder)

            if not os.path.exists(report_filename):
                logging.info('Download Report from {}'.format(url))
                r = http.request('GET', url,
                                 retries=urllib3.Retry(
                                     redirect=2,
                                     backoff_factor=2,
                                 ))
                if r.status == 200:
                    with open(report_filename, 'wb') as f:
                        f.write(r.data)
                    r.release_conn()

            logging.info('Read from {}'.format(report_filename))
            new_report_data = petl.io.fromcsv(report_filename)
            if not report_data:
                report_data = new_report_data
            else:
                report_data = petl.cat(report_data, new_report_data)
    return report_data