Example #1
def gosreestr_parse_new_uids(fpath,
                             existed_uids,
                             timeout,
                             error_timeout,
                             luigi_callback=None):
    page_index = 0
    s = requests.Session()
    headers = Headers(headers=True)

    _existed_uids = list(existed_uids)  # copy so we don't mutate the caller's list

    if os.path.exists(fpath):
        # resume: collect previously parsed uids and the page to continue from
        lines = read_lines(fpath)
        parsed_uids = [line.split(';')[0] for line in lines]
        page_index = int(lines[-1].split(';')[1]) + 1
        _existed_uids.extend(parsed_uids)

    form_data = prepare_request_data(FORM_DATA, page_index)
    s.headers = headers.generate()
    table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
    new_uids_count = 0
    new_uids = list()
    while not check_empty_table(table_raw):
        uids = parse_ids_from_table(table_raw)
        _new_uids = list()
        for uid in uids:
            if uid not in _existed_uids:
                _new_uids.append(uid)
                append_file(fpath, f'{uid};{page_index}')
            else:
                # stop at the first already-known uid; the remaining rows
                # of the table are assumed to be parsed already
                break

        new_uids.extend(_new_uids)
        new_uids_count += len(_new_uids)

        form_data = prepare_request_data(FORM_DATA, page_index)

        try:
            s.headers = headers.generate()
            table_raw = s.post(LIST_URL, data=form_data, timeout=15).text
        except (ReadTimeout, ConnectTimeout, ConnectionError,
                ReadTimeoutError):
            if luigi_callback:
                luigi_callback(
                    f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after error.',
                    0)
            sleep(error_timeout)
        else:
            page_index += 1
            if luigi_callback:
                luigi_callback(
                    f'Page: {page_index}, parsed count: {new_uids_count}. Timeout after success.',
                    0)
            sleep(timeout)

    return new_uids
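
Most of the snippets on this page lean on two small project helpers, read_lines and append_file, that are not shown here. A minimal sketch of what they could look like, judging only by how they are called (the real implementations may differ):

def read_lines(fpath):
    # return the file's lines with trailing newlines stripped
    with open(fpath, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


def append_file(fpath, line):
    # append a single line, creating the file if it does not exist
    with open(fpath, 'a', encoding='utf-8') as f:
        f.write(f'{line}\n')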
Example #2
    def get_parsed_chunks(self):
        # get parsed chunks from prs file
        parsed_chunks = []
        if self.parsed_fpath and os.path.exists(self.parsed_fpath):
            parsed_chunks = read_lines(self.parsed_fpath)

        return [int(chunk) for chunk in parsed_chunks]
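
For illustration, a throwaway .prs file with three parsed chunk indices (the path and values below are made up) and the list get_parsed_chunks would build from it:

import os
import tempfile

parsed_fpath = os.path.join(tempfile.gettempdir(), 'report.prs')  # hypothetical path
with open(parsed_fpath, 'w') as f:
    f.write('0\n1\n3\n')

parsed_chunks = [line.rstrip('\n') for line in open(parsed_fpath)]
print([int(chunk) for chunk in parsed_chunks])  # [0, 1, 3]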
Example #3
    def __init__(self, bids_fpath, output_fpath, parsed_fpath):
        self.failed_bids = deque()
        self.output_fpath = output_fpath

        parsed_bids = []
        if exists(parsed_fpath):
            parsed_bids = read_lines(parsed_fpath)

        self._parsed_bids_count = len(parsed_bids)

        source_bids = [bid for bid in read_lines(bids_fpath) if check_id(bid)]
        self._source_bids_count = len(source_bids)

        # exclude bids that were already parsed
        if parsed_bids:
            remaining = set(source_bids)
            remaining.difference_update(parsed_bids)
            self._bids = deque(remaining)
        else:
            self._bids = deque(source_bids)
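
The exclusion step above is a plain set difference; a standalone illustration with made-up ids (note that going through a set drops the original order of the bids):

from collections import deque

source_bids = ['100', '101', '102', '103']  # hypothetical ids
parsed_bids = ['100', '102']

remaining = set(source_bids)
remaining.difference_update(parsed_bids)

bids = deque(remaining)
print(bids)  # contains '101' and '103', in arbitrary order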
Example #4
    def run(self):
        bids_fpath = build_fpath(self.directory, self.name, 'uids')

        # copy the file from FTP to the local machine
        if not exists(bids_fpath):
            self.input().get(bids_fpath)

        uids = read_lines(bids_fpath)
        new_uids = gosreestr_parse_new_uids(self.output().path,
                                            uids,
                                            timeout=self.timeout,
                                            error_timeout=self.timeout_error,
                                            luigi_callback=self.set_status)
        append_file(self.success_fpath, len(new_uids))
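
build_fpath is another project helper that is not shown; judging by the call above, it presumably just joins the directory, base name, and extension, roughly like this (an assumption, not the project's actual code):

import os


def build_fpath(directory, name, ext):
    # e.g. build_fpath('/data', 'gosreestr', 'uids') -> '/data/gosreestr.uids'
    return os.path.join(directory, f'{name}.{ext}')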
Example #5
    def run(self):
        error_timeout = self.timeout * 3
        headers = dict()
        headers['Authorization'] = self.token

        url = f'{self.url}?limit={self.limit}'
        host = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))

        # we store parsed blocks of data as uris,
        # so on a rerun we can resume from the last one
        if os.path.exists(self.parsed_fpath):
            uri = read_lines(self.parsed_fpath).pop()
            url = f'{host}{uri}'

        total = 0
        parsed_count = get_file_lines_count(self.output().path)
        parsed_count = 0 if not parsed_count else parsed_count

        while url:
            try:
                r = get(url, headers=headers, timeout=self.timeout)
            except Exception:
                sleep(error_timeout)
            else:
                response = Box(json.loads(r.text))
                if response.next_page:
                    url = f'{self.url}?{response.next_page}'
                    append_file(self.parsed_fpath, response.next_page)
                else:
                    url = None

                total = response.total
                raw_items = list(response['items'])
                data = [dict_to_csvrow(d, self.struct) for d in raw_items]
                save_csvrows(self.output().path, data, quoter="\"")
                parsed_count += self.limit
                sleep(self.timeout)

            self.set_status_message(f'Total: {total}. Parsed: {parsed_count}')
            if total:
                self.set_progress_percentage(round((parsed_count * 100) / total))

        stat = dict(total=total, parsed=parsed_count)
        append_file(self.success_fpath, str(stat))
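
The rerun logic at the top of run() only needs the scheme and host of the original url plus the last relative uri stored in parsed_fpath; isolated below with a made-up endpoint and cursor:

from urllib.parse import urlparse

url = 'https://api.example.com/v1/items?limit=100'  # hypothetical endpoint
host = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))

last_uri = '/v1/items?cursor=abc123'  # hypothetical value read from the parsed file
resume_url = f'{host}{last_uri}'
print(resume_url)  # https://api.example.com/v1/items?cursor=abc123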
Example #6
    def prs_ids(self):
        return read_lines(self.parsed_file)
Example #7
    def src_ids(self):
        return read_lines(self._ids_fpath)
Example #8
    def run(self):
        # read url from file
        url = read_lines(self.input().path)[0]
        apath = os.path.join(TMP_DIR, f'{self.name}.zip')
        frmt = save_webfile(url, apath)
        unzip_one_file(apath, self.name)
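
save_webfile and unzip_one_file are project helpers too; a rough stand-in built on requests and the standard library, assuming save_webfile streams the archive to disk and unzip_one_file extracts a single member next to it (the real helpers likely add retries and format detection):

import os
import zipfile

import requests


def save_webfile(url, apath, timeout=60):
    # stream the remote file to disk and return its extension as the "format"
    r = requests.get(url, stream=True, timeout=timeout)
    r.raise_for_status()
    with open(apath, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
    return os.path.splitext(apath)[1].lstrip('.')


def unzip_one_file(apath, name):
    # extract the first archive member into the same directory under <name>
    with zipfile.ZipFile(apath) as z:
        member = z.namelist()[0]
        target = os.path.join(os.path.dirname(apath), name)
        with z.open(member) as src, open(target, 'wb') as dst:
            dst.write(src.read())
    return target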
Example #9
def parse_dgovbig(rep,
                  struct,
                  apikey,
                  output_fpath,
                  parsed_fpath,
                  updates_date=None,
                  version=None,
                  query=None,
                  callback=None):

    # retrieve the total row count
    total = load_total(build_url_for_detail_page(rep, apikey, version, query))

    # get parsed chunks from prs file
    parsed_chunks = []
    if os.path.exists(parsed_fpath):
        parsed_chunks = read_lines(parsed_fpath)

    is_retrying = False
    parsed_chunks_count = 0
    if parsed_chunks:
        parsed_chunks_count = len(parsed_chunks)
        is_retrying = True

    # build chunks considering already parsed chunks
    chunks, total_chunks, parsed_count = prepare_chunks(total, parsed_chunks)

    errors = 0

    # a deque of chunks is convenient here:
    # a failed chunk can be put back and retried later
    chunks = deque(chunks)
    while chunks:
        _ch = chunks.popleft()
        chunk = Chunk(*(_ch.split(':')))
        query = '{' + QUERY_TMPL.format(chunk.start, chunk.size) + '}'

        url = build_url_for_data_page(rep,
                                      apikey,
                                      version=version,
                                      query=query)
        print(url)
        try:
            data = load2(url, struct, updates_date=updates_date)
        except (HTTPError, ConnectionError, Timeout, RetryError,
                ReadTimeout) as exc:
            chunks.append(_ch)
            sleep(TIMEOUT * 2)
            errors += 1
        else:
            _chunk = Chunk(chunk.start, chunk.size, len(data))
            parsed_count += _chunk.count
            parsed_chunks_count += 1
            save_csvrows(output_fpath, data)
            append_file(parsed_fpath, ':'.join((str(ch) for ch in _chunk)))
            sleep(TIMEOUT)
        if callback:
            s, p = prepare_callback_info(total, total_chunks, parsed_count,
                                         errors, parsed_chunks_count,
                                         updates_date, is_retrying)
            callback(s, p)

    # if not all chunks were parsed,
    # the task should be retried after some time
    if total_chunks != parsed_chunks_count:
        raise ExternalSourceError("Could not parse all chunks. Try again.")

    stata = dict(total=total, parsed_count=parsed_count)
    append_file(success_fpath(output_fpath), json.dumps(stata))
    return parsed_count
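
prepare_chunks is not shown either; since each parsed line is saved as start:size:count, a plausible sketch splits total into fixed-size chunks and skips the starts already present in the parsed file (the chunk size below is a placeholder):

CHUNK_SIZE = 10000  # assumption: the real chunk size is project-specific


def prepare_chunks(total, parsed_chunks, size=CHUNK_SIZE):
    # parsed lines look like 'start:size:count'
    parsed_starts = {int(ch.split(':')[0]) for ch in parsed_chunks}
    parsed_count = sum(int(ch.split(':')[2]) for ch in parsed_chunks)

    chunks = []
    total_chunks = 0
    for start in range(0, total, size):
        total_chunks += 1
        if start in parsed_starts:
            continue
        # the row count is unknown until the chunk is fetched, hence 0
        chunks.append(f'{start}:{min(size, total - start)}:0')

    return chunks, total_chunks, parsed_count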