def gosreestr_parse_new_uids(fpath, existed_uids, timeout, error_timeout, luigi_callback=None):
    page_index = 0
    s = requests.Session()
    headers = Headers(headers=True)

    _existed_uids = existed_uids

    # resume: uids already parsed are stored in fpath as '<uid>;<page_index>' lines
    if os.path.exists(fpath):
        lines = read_lines(fpath)
        parsed_uids = [u.split(';')[0] for u in lines]
        page_index = int(lines[-1].split(';')[1]) + 1
        _existed_uids.extend(parsed_uids)

    form_data = prepare_request_data(FORM_DATA, page_index)
    s.headers = headers.generate()
    table_raw = s.post(LIST_URL, data=form_data, timeout=15).text

    new_uids_count = 0
    new_uids = list()

    while not check_empty_table(table_raw):
        uids = parse_ids_from_table(table_raw)
        _new_uids = list()

        for uid in uids:
            if uid not in _existed_uids:
                _new_uids.append(uid)
                append_file(fpath, f'{uid};{page_index}')
            else:
                # stop at the first uid that is already known
                break

        new_uids.extend(_new_uids)
        # remember freshly parsed uids so a page re-fetched after a timeout
        # is not written to the checkpoint file twice
        _existed_uids.extend(_new_uids)
        new_uids_count += len(_new_uids)

        form_data = prepare_request_data(FORM_DATA, page_index)

        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
        except (ReadTimeout, ConnectTimeout, ConnectionError, ReadTimeoutError):
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after error',
                0)
            sleep(error_timeout)
        else:
            page_index += 1
            luigi_callback(
                f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after success.',
                0)
            sleep(timeout)

    return new_uids
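# The parser above checkpoints its progress to `fpath` as '<uid>;<page_index>'
# lines, which is what allows it to resume. `read_lines` and `append_file` are
# repository helpers whose definitions are not shown here; the sketch below is
# only an assumption about their contract, not the actual implementation.

def read_lines(fpath):
    # Assumed helper: return the file's lines with trailing newlines stripped.
    with open(fpath, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


def append_file(fpath, line):
    # Assumed helper: append a single line to a checkpoint/output file.
    with open(fpath, 'a', encoding='utf-8') as f:
        f.write(f'{line}\n')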
def get_parsed_chunks(self):
    # chunk numbers already parsed, read from the .prs file
    parsed_chunks = []
    if self.parsed_fpath and os.path.exists(self.parsed_fpath):
        parsed_chunks = read_lines(self.parsed_fpath)

    return [int(chunk) for chunk in parsed_chunks]
def __init__(self, bids_fpath, output_fpath, parsed_fpath):
    self.failed_bids = deque([])
    self.output_fpath = output_fpath

    parsed_bids = []
    if exists(parsed_fpath):
        parsed_bids = read_lines(parsed_fpath)

    self._parsed_bids_count = len(parsed_bids)

    source_bids = [bid for bid in read_lines(bids_fpath) if check_id(bid)]
    self._source_bids_count = len(source_bids)

    # exclude bids that have already been parsed
    if parsed_bids:
        s = set(source_bids)
        s.difference_update(set(parsed_bids))
        self._bids = deque(s)
    else:
        self._bids = deque(source_bids)
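# The set difference in the constructor above discards the original order of
# `source_bids`. If deterministic ordering across reruns matters, an
# order-preserving variant is a small change; this is a sketch under that
# assumption, not the repository's code.

from collections import deque


def remaining_bids(source_bids, parsed_bids):
    # Exclude already parsed bids while keeping the source order.
    parsed = set(parsed_bids)
    return deque(bid for bid in source_bids if bid not in parsed)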
def run(self):
    bids_fpath = build_fpath(self.directory, self.name, 'uids')

    # copy the uids file from FTP to the local machine if it is not there yet
    if not exists(bids_fpath):
        self.input().get(bids_fpath)

    uids = read_lines(bids_fpath)

    new_uids = gosreestr_parse_new_uids(self.output().path, uids,
                                        timeout=self.timeout,
                                        error_timeout=self.timeout_error,
                                        luigi_callback=self.set_status)

    append_file(self.success_fpath, str(len(new_uids)))
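# `gosreestr_parse_new_uids` calls `luigi_callback(message, percent)`, so
# `self.set_status` is expected to have that signature. A minimal sketch of a
# compatible method, assuming it simply forwards to Luigi's built-in status
# hooks (the task's real implementation is not shown in this excerpt):

def set_status(self, message, percent):
    # Forward the parser's progress to the Luigi scheduler UI.
    self.set_status_message(message)
    self.set_progress_percentage(percent)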
def run(self):
    error_timeout = self.timeout * 3

    headers = {'Authorization': self.token}

    url = f'{self.url}?limit={self.limit}'
    host = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))

    # URIs of already fetched pages are stored in the parsed file;
    # when rerunning, resume from the last stored URI
    if os.path.exists(self.parsed_fpath):
        uri = read_lines(self.parsed_fpath).pop()
        url = f'{host}{uri}'

    total = 0
    parsed_count = get_file_lines_count(self.output().path) or 0

    while url:
        try:
            r = get(url, headers=headers, timeout=self.timeout)
        except Exception:
            sleep(error_timeout)
        else:
            # `get` is assumed to return the raw JSON body of the response
            response = Box(json.loads(r))

            if response.next_page:
                url = f'{self.url}?{response.next_page}'
                append_file(self.parsed_fpath, response.next_page)
            else:
                url = None

            total = response.total
            raw_items = list(response['items'])
            data = [dict_to_csvrow(d, self.struct) for d in raw_items]
            save_csvrows(self.output().path, data, quoter='"')

            parsed_count += self.limit
            sleep(self.timeout)

            self.set_status_message(f'Total: {total}. Parsed: {parsed_count}')
            self.set_progress_percentage(round((parsed_count * 100) / total))

    stat = dict(total=total, parsed=parsed_count)
    append_file(self.success_fpath, str(stat))
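# The loop above assumes every response is a JSON object with `next_page`,
# `total` and `items` keys, wrapped in python-box's `Box` for attribute access.
# A small illustration of that assumed payload shape (the values are made up):

import json

from box import Box

payload = json.dumps({
    'total': 250,
    'next_page': '/v1/items?limit=100&cursor=abc123',
    'items': [{'id': 1}, {'id': 2}],
})

response = Box(json.loads(payload))
print(response.total)          # 250
print(response.next_page)      # /v1/items?limit=100&cursor=abc123
print(len(response['items']))  # 2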
def prs_ids(self):
    return read_lines(self.parsed_file)
def src_ids(self):
    return read_lines(self._ids_fpath)
def run(self):
    # the source URL is stored in the input target
    url = read_lines(self.input().path)[0]

    apath = os.path.join(TMP_DIR, f'{self.name}.zip')

    frmt = save_webfile(url, apath)
    unzip_one_file(apath, self.name)
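# `save_webfile` and `unzip_one_file` are repository helpers that are not shown
# here. The sketch below only illustrates the contract the task above relies on
# (download the archive to a path, then extract a member); names, return values
# and behaviour are assumptions.

import os
import zipfile

import requests


def save_webfile(url, apath, timeout=30):
    # Assumed contract: download `url` to `apath`, return the archive format.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    with open(apath, 'wb') as f:
        f.write(r.content)
    return apath.rsplit('.', 1)[-1]


def unzip_one_file(apath, name):
    # Assumed contract: extract exactly one member from the archive into the
    # same directory; `name` presumably selects or renames the target file.
    with zipfile.ZipFile(apath) as zf:
        member = zf.namelist()[0]
        zf.extract(member, path=os.path.dirname(apath) or '.')
        return member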
def parse_dgovbig(rep, struct, apikey, output_fpath, parsed_fpath,
                  updates_date=None, version=None, query=None, callback=None):

    # retrieve the total record count for the report
    total = load_total(build_url_for_detail_page(rep, apikey, version, query))

    # get already parsed chunks from the .prs file
    parsed_chunks = []
    if os.path.exists(parsed_fpath):
        parsed_chunks = read_lines(parsed_fpath)

    is_retrying = False
    parsed_chunks_count = 0
    if parsed_chunks:
        parsed_chunks_count = len(parsed_chunks)
        is_retrying = True

    # build the chunks to fetch, skipping those already parsed
    chunks, total_chunks, parsed_count = prepare_chunks(total, parsed_chunks)

    errors = 0

    # a deque of chunks is convenient: a failed chunk can be
    # put back in the queue and retried later
    chunks = deque(chunks)

    while chunks:
        _ch = chunks.popleft()
        chunk = Chunk(*(_ch.split(':')))
        query = '{' + QUERY_TMPL.format(chunk.start, chunk.size) + '}'
        url = build_url_for_data_page(rep, apikey, version=version, query=query)
        print(url)

        try:
            data = load2(url, struct, updates_date=updates_date)
        except (HTTPError, ConnectionError, Timeout, RetryError, ReadTimeout):
            # put the failed chunk back and try it again later
            chunks.append(_ch)
            sleep(TIMEOUT * 2)
            errors += 1
        else:
            _chunk = Chunk(chunk.start, chunk.size, len(data))
            parsed_count += _chunk.count
            parsed_chunks_count += 1
            save_csvrows(output_fpath, data)
            append_file(parsed_fpath, ':'.join(str(ch) for ch in _chunk))
            sleep(TIMEOUT)

        if callback:
            s, p = prepare_callback_info(total, total_chunks, parsed_count,
                                         errors, parsed_chunks_count,
                                         updates_date, is_retrying)
            callback(s, p)

    # if not all chunks were parsed, raise so the task can be retried later
    if total_chunks != parsed_chunks_count:
        raise ExternalSourceError("Could not parse all chunks. Try again.")

    stata = dict(total=total, parsed_count=parsed_count)
    append_file(success_fpath(output_fpath), json.dumps(stata))

    return parsed_count
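# Each processed chunk is checkpointed to `parsed_fpath` as a colon-joined
# 'start:size:count' line and later rebuilt with Chunk(*(_ch.split(':'))).
# A round-trip sketch of that format, assuming `Chunk` is a three-field
# namedtuple (its actual definition is not shown in this excerpt):

from collections import namedtuple

Chunk = namedtuple('Chunk', ['start', 'size', 'count'])

chunk = Chunk(start=50000, size=10000, count=9873)
line = ':'.join(str(field) for field in chunk)   # '50000:10000:9873'

# Reading the line back; note the fields come back as strings.
restored = Chunk(*line.split(':'))
assert restored.start == '50000'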