def run(self):
    client = self.get_client()
    query = gql(self.query)

    start_from = None
    params = {'from': str(self.start_date), 'to': str(self.end_date),
              'limit': self.limit}

    while True:
        p = params
        if start_from:
            p["after"] = start_from

        data = client.execute(query, variable_values=p)

        if data.get('Contract') is None or len(data.get('Contract', [])) == 0:
            break

        # remember the last id to request the next page
        last_id = data.get('Contract', [])[-1]['id']
        start_from = last_id

        data = [dict_to_csvrow(d, self.struct) for d in data.get('Contract')]
        save_csvrows(self.output().path, data, sep=self.sep, quoter="\"")
def run(self):
    client = self.get_client()
    query = gql(self.query)

    start_from = None
    params = {'from': str(self.start_date), 'to': str(self.end_date),
              'limit': self.limit}

    # write the header row first
    header = tuple(f.name for f in attr.fields(GoszakupCompanyRow))
    save_csvrows(self.output().path, [header], sep=self.sep)

    while True:
        p = params
        if start_from:
            p["after"] = start_from

        data = client.execute(query, variable_values=p)

        if data.get('Subjects') is None or len(data.get('Subjects', [])) == 0:
            break

        # remember the last pid to request the next page
        last_id = data.get('Subjects', [])[-1]['pid']
        start_from = last_id

        data = [dict_to_csvrow(d, self.struct) for d in data.get('Subjects')]
        save_csvrows(self.output().path, data, sep=self.sep, quoter="\"")
def run(self):
    query = '{' + QUERY_TMPL.format(0, self.chunk_size) + '}'
    rep_url = build_url_for_report_page(self.rep_name)

    versions = self.versions
    if not versions:
        versions = load_versions(rep_url)

    for vs in versions:
        url = build_url_for_data_page(self.rep_name, self.api_key,
                                      version=vs, query=query)
        data = load_data(url, self.struct, self.columns_filter)
        save_csvrows(self.output().path, data)
def run(self):
    for i, target in enumerate(self.input()):
        self.set_status_message('Parsing {}'.format(target.path))
        rows = parse(target.path, Row, skiprows=self.skiptop, sheets=self.sheets)
        save_csvrows(self.output().path, [attr.astuple(r) for r in rows])

        percent = round((i + 1) * 100 / len(self.input()))
        self.set_progress_percentage(percent)
def parse_excel_rect_area_to_csv(xl_fpath, csv_fpath, wrapper, sheets=None,
                                 skiptopnum=None, usecols=None,
                                 transform_callback=None):
    """ Save records parsed from an Excel file to a csv file. """

    # get the list of sheets in the workbook
    xl_df = pd.ExcelFile(xl_fpath)
    xl_sheets = xl_df.sheet_names

    _sheets = sheets

    # by default we parse all the sheets
    if not sheets:
        _sheets = list(range(len(xl_sheets)))

    # init the number of rows to skip from the top;
    # by default we always skip one row
    _skiptopnums = [1 for _ in range(len(_sheets))]

    # for now, if skiptopnum is given, it is applied only to the first sheet
    if skiptopnum:
        _skiptopnums[0] = skiptopnum

    count = 0

    for i, sh in enumerate(_sheets):
        if sh <= len(xl_sheets) - 1:
            df = pd.read_excel(xl_fpath,
                               sheet_name=xl_sheets[sh],
                               skiprows=_skiptopnums[i],
                               usecols=usecols,
                               index_col=None,
                               dtype=str,
                               header=None)

            # convert Excel's empty cells to empty strings
            data = df.replace(np.nan, '', regex=True)
            data.dropna(inplace=True)

            rows = [wrapper(*x) for x in data.values]

            if len(rows) > 0:
                if transform_callback:
                    transform_callback(rows)

                save_csvrows(csv_fpath, [attr.astuple(r) for r in rows])
                count += len(rows)

    return count
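# A minimal usage sketch for parse_excel_rect_area_to_csv, assuming an
# attrs-based row class; SampleRow and the file paths below are illustrative
# assumptions, not names from the codebase.
import attr

@attr.s
class SampleRow:
    # field order must match the column order of the rectangular area
    code = attr.ib(default='')
    name = attr.ib(default='')

# parse only the first sheet, skipping two header rows from the top
parsed = parse_excel_rect_area_to_csv('/tmp/report.xlsx', '/tmp/report.csv',
                                      SampleRow, sheets=[0],
                                      skiptopnum=2, usecols='A:B')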
def run(self):
    rep_url = build_url_for_report_page(self.rep_name)

    versions = self.versions
    if not versions:
        versions = load_versions(rep_url)

    for vs in versions:
        url = build_url_for_data_page(self.rep_name, DGOV_API_KEY, version=vs)
        data = load_data(url, Row)
        save_csvrows(self.output().path, data)
def run(self):
    url = build_url_for_report_page(self.report_name)

    # use the most recent version of the report
    version = load_versions(url)[-1]

    parser = DatagovApiParsing(self.api_key, self.report_name, self.struct,
                               self.chunk_size, self.output().path)

    d = self.date.strftime(FRMT)
    q = '{"size":%s,"query":{"bool":{"must":[{"match":{"date":"%s"}}]}}}' % (
        self.chunk_size, d)

    data = parser.parse_query_report(version, q)
    save_csvrows(self.output().path, data)
def parse_chunk(url, struct, output_fpath, updates_for=None, timeout=None,
                retries=None, backoff_factor=None):
    data = []
    try:
        data = load(url, struct, updates_for=updates_for, timeout=timeout,
                    retries=retries, backoff_factor=backoff_factor)
    except Exception:
        raise
    else:
        save_csvrows(output_fpath, data)
        # sleep(10)

    return len(data)
def run(self):
    error_timeout = self.timeout * 3

    headers = dict()
    headers['Authorization'] = self.token

    url = f'{self.url}?limit={self.limit}'
    host = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))

    # parsed blocks of data are stored as uris;
    # when rerunning we resume from the last parsed uri
    if os.path.exists(self.parsed_fpath):
        uri = read_lines(self.parsed_fpath).pop()
        url = f'{host}{uri}'

    total = 0
    parsed_count = get_file_lines_count(self.output().path)
    parsed_count = 0 if not parsed_count else parsed_count

    while url:
        try:
            r = get(url, headers=headers, timeout=self.timeout)
        except Exception:
            sleep(error_timeout)
        else:
            response = Box(json.loads(r))

            if response.next_page:
                url = f'{self.url}?{response.next_page}'
                append_file(self.parsed_fpath, response.next_page)
            else:
                url = None

            total = response.total
            raw_items = list(response['items'])
            data = [dict_to_csvrow(d, self.struct) for d in raw_items]
            save_csvrows(self.output().path, data, quoter="\"")
            parsed_count += self.limit

            sleep(self.timeout)

            self.set_status_message(f'Total: {total}. Parsed: {parsed_count}')
            self.set_progress_percentage(round((parsed_count * 100) / total))

    stat = dict(total=total, parsed=parsed_count)
    append_file(self.success_fpath, str(stat))
def parse_report(rep, struct, apikey, output_fpath, parsed_fpath,
                 updates_date=None, version=None, query=None, callback=None):

    # retrieve the total count of records
    total = load_total(build_url_for_detail_page(rep, apikey, version, query))

    # get already parsed chunks from the .prs file
    parsed_chunks = []
    if os.path.exists(parsed_fpath):
        parsed_chunks = read_lines(parsed_fpath)

    is_retrying = False
    parsed_chunks_count = 0
    if parsed_chunks:
        parsed_chunks_count = len(parsed_chunks)
        is_retrying = True

    # build chunks, taking already parsed chunks into account
    chunks, total_chunks, parsed_count = prepare_chunks(total, parsed_chunks)

    errors = 0

    with futures.ThreadPoolExecutor(max_workers=3) as ex:
        to_do_map = {}
        for chunk in chunks:
            _chunk = Chunk(*(chunk.split(':')))
            query = '{' + QUERY_TMPL.format(_chunk.start, _chunk.size) + '}'
            url = build_url_for_data_page(rep, apikey, version=version, query=query)
            future = ex.submit(load2, url, struct, updates_date)
            to_do_map[future] = chunk

        done_iter = futures.as_completed(to_do_map)

        for future in done_iter:
            try:
                data = future.result()
                start, size, _ = to_do_map[future].split(':')
            except (HTTPError, ConnectionError, Timeout,
                    RetryError, ReadTimeout) as exc:
                print(exc)
                errors += 1
                sleep(TIMEOUT * 2)
            else:
                _chunk = '{}:{}:{}'.format(start, size, len(data))
                print(_chunk)
                parsed_count += len(data)
                parsed_chunks_count += 1
                save_csvrows(output_fpath, data)
                append_file(parsed_fpath, _chunk)
                sleep(TIMEOUT)

            if callback:
                s, p = prepare_callback_info(total, total_chunks, parsed_count,
                                             errors, parsed_chunks_count,
                                             updates_date, is_retrying)
                callback(s, p)

    if total_chunks != parsed_chunks_count:
        raise ExternalSourceError("Could not parse all the data. Try again.")

    stata = dict(total=total, parsed_count=parsed_count)
    append_file(success_fpath(output_fpath), json.dumps(stata))

    return parsed_count
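# Both parse_report above and parse_dgovbig below unpack "start:size:count"
# strings into a Chunk record. A minimal sketch of the assumed definition
# (field names are inferred from usage; the real one lives elsewhere in the repo):
from collections import namedtuple

Chunk = namedtuple('Chunk', ['start', 'size', 'count'])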
def run(self):
    for target in self.input():
        rows = parse(target.path, Row, skiprows=self.skiptop,
                     usecols=self.usecolumns)
        save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
def run(self):
    rows = nb_rates_as_csvrows(self.url)
    save_csvrows(self.output().path, rows)
def run(self):
    d = parse_json_from_js(self.url, self.pattern)

    # wrap each row and get a tuple
    rows = [attr.astuple(Row(**_d)) for _d in d]
    save_csvrows(self.output().path, rows)
def run(self):
    rows = parse(self.input().path, Row, skiprows=self.skiptop)
    save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
def parse_report(self, version, dates_range=None, progress_callback=None):
    total_rows = self.get_total_rows_for_version(version)
    all_chunks = get_chunks_start_position(total_rows, self.chunk_size)

    parsed_chunks = self.get_parsed_chunks()

    is_retrying = False
    if parsed_chunks:
        is_retrying = True

    # queue of chunks still to parse; failed chunks are pushed back for retry
    chunks = deque(prepare_chunks2(all_chunks, parsed_chunks))

    errors = 0
    parsed_chunks_count = len(parsed_chunks)
    parsed_rows_count = parsed_chunks_count * self.chunk_size

    while chunks:
        chunk = chunks.popleft()
        query = build_query(chunk, self.chunk_size, dates_range=dates_range)
        url = build_url_for_data_page(self.report_name, self.apikey,
                                      version=version, query=query)
        print(url)
        try:
            data = load3(url, self.struct)
        except (HTTPError, ConnectionError, Timeout,
                RetryError, ReadTimeout) as exc:
            print(exc)
            # put the failed chunk back and retry it later
            chunks.append(chunk)
            sleep(TIMEOUT * 2)
            errors += 1
        else:
            if (not data) and dates_range:
                break

            parsed_rows_count += len(data)
            parsed_chunks_count += 1
            save_csvrows(self.output_fpath, data)

            if self.parsed_fpath:
                append_file(self.parsed_fpath, str(chunk))

            sleep(self.timeout)

        if progress_callback:
            s, p = self._progress_status_info(
                version,
                len(all_chunks),
                errors,
                parsed_rows_count,
                parsed_chunks_count,
                is_retrying=is_retrying,
            )
            progress_callback(s, p)

    if not total_rows:
        raise ExternalSourceError(
            f'Report {self.report_name}:{version} has no data.')

    return total_rows, parsed_rows_count
def parse_dgovbig(rep, struct, apikey, output_fpath, parsed_fpath,
                  updates_date=None, version=None, query=None, callback=None):

    # retrieve the total count of records
    total = load_total(build_url_for_detail_page(rep, apikey, version, query))

    # get already parsed chunks from the .prs file
    parsed_chunks = []
    if os.path.exists(parsed_fpath):
        parsed_chunks = read_lines(parsed_fpath)

    is_retrying = False
    parsed_chunks_count = 0
    if parsed_chunks:
        parsed_chunks_count = len(parsed_chunks)
        is_retrying = True

    # build chunks, taking already parsed chunks into account
    chunks, total_chunks, parsed_count = prepare_chunks(total, parsed_chunks)

    errors = 0

    # a deque of chunks is convenient here: on failure we can
    # put a chunk aside and retry it later
    chunks = deque(chunks)

    while chunks:
        _ch = chunks.popleft()
        chunk = Chunk(*(_ch.split(':')))
        query = '{' + QUERY_TMPL.format(chunk.start, chunk.size) + '}'
        url = build_url_for_data_page(rep, apikey, version=version, query=query)
        print(url)
        try:
            data = load2(url, struct, updates_date=updates_date)
        except (HTTPError, ConnectionError, Timeout,
                RetryError, ReadTimeout) as exc:
            chunks.append(_ch)
            sleep(TIMEOUT * 2)
            errors += 1
        else:
            _chunk = Chunk(chunk.start, chunk.size, len(data))
            parsed_count += _chunk.count
            parsed_chunks_count += 1
            save_csvrows(output_fpath, data)
            append_file(parsed_fpath, ':'.join(str(ch) for ch in _chunk))
            sleep(TIMEOUT)

        if callback:
            s, p = prepare_callback_info(total, total_chunks, parsed_count,
                                         errors, parsed_chunks_count,
                                         updates_date, is_retrying)
            callback(s, p)

    # if we have not parsed all the chunks,
    # we should retry after some time
    if total_chunks != parsed_chunks_count:
        raise ExternalSourceError("Could not parse all chunks. Try again.")

    stata = dict(total=total, parsed_count=parsed_count)
    append_file(success_fpath(output_fpath), json.dumps(stata))

    return parsed_count
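# A hypothetical invocation of parse_dgovbig; the report name, row struct
# and paths below are illustrative assumptions, not values from the codebase.
parsed = parse_dgovbig(
    'example_report',                        # report name on the data portal (assumed)
    Row,                                     # attrs-based row structure for the report
    DGOV_API_KEY,                            # API key constant used elsewhere in the repo
    output_fpath='/tmp/example_report.csv',
    parsed_fpath='/tmp/example_report.prs',
    version='v1',                            # assumed version identifier
)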