Example #1
    def run(self):
        client = self.get_client()
        query = gql(self.query)
        start_from = None
        params = {
            'from': str(self.start_date),
            'to': str(self.end_date),
            'limit': self.limit
        }

        while True:
            if start_from:
                params['after'] = start_from

            data = client.execute(query, variable_values=params)
            contracts = data.get('Contract') or []
            if not contracts:
                break

            # paginate from the last returned id on the next iteration
            start_from = contracts[-1]['id']
            rows = [dict_to_csvrow(d, self.struct) for d in contracts]
            save_csvrows(self.output().path, rows, sep=self.sep, quoter='"')
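Every example on this page funnels its rows through save_csvrows, whose implementation is not shown here. A minimal sketch consistent with how it is called (a path, an iterable of row tuples, optional sep and quoter) might look like the following; the default separator, the append semantics and the return value are assumptions, not the library's actual code.

import csv


def save_csvrows(fpath, rows, sep=';', quoter='"'):
    """Append row tuples to a CSV file (assumed behaviour, defaults guessed)."""
    rows = list(rows)
    # append mode keeps rows written by earlier pages/chunks;
    # newline='' prevents blank lines on Windows
    with open(fpath, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=sep, quotechar=quoter,
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerows(rows)
    return len(rows)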
Example #2
    def run(self):
        client = self.get_client()
        query = gql(self.query)
        start_from = None
        params = {
            'from': str(self.start_date),
            'to': str(self.end_date),
            'limit': self.limit
        }

        header = tuple(f.name for f in attr.fields(GoszakupCompanyRow))
        save_csvrows(self.output().path, [header], sep=self.sep)

        while True:
            if start_from:
                params['after'] = start_from

            data = client.execute(query, variable_values=params)
            subjects = data.get('Subjects') or []
            if not subjects:
                break

            # paginate from the last returned pid on the next iteration
            start_from = subjects[-1]['pid']
            rows = [dict_to_csvrow(d, self.struct) for d in subjects]
            save_csvrows(self.output().path, rows, sep=self.sep, quoter='"')
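Examples #1, #2 and #9 convert raw API dicts with dict_to_csvrow(d, self.struct), where struct appears to be an attrs class (see the attr.fields(GoszakupCompanyRow) header above). A plausible sketch of that helper, assuming it simply projects the dict onto the struct's fields in declaration order; the real helper may also normalise keys or cast types:

import attr


def dict_to_csvrow(raw, struct):
    """Project a raw API dict onto an attrs class and return a tuple (assumed)."""
    # keep only the keys the struct declares, in field order, so every
    # row lines up with the header row written before the loop
    field_names = [f.name for f in attr.fields(struct)]
    wrapped = struct(**{name: raw.get(name, '') for name in field_names})
    return attr.astuple(wrapped)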
Example #3
    def run(self):
        query = '{' + QUERY_TMPL.format(0, self.chunk_size) + '}'
        rep_url = build_url_for_report_page(self.rep_name)
        versions = self.versions
        if not versions:
            versions = load_versions(rep_url)
        for vs in versions:
            url = build_url_for_data_page(self.rep_name, self.api_key,
                                          version=vs, query=query)
            data = load_data(url, self.struct, self.columns_filter)
            save_csvrows(self.output().path, data)
Example #4
    def run(self):
        for i, target in enumerate(self.input()):
            self.set_status_message('Parsing {}'.format(target.path))
            rows = parse(target.path,
                         Row,
                         skiprows=self.skiptop,
                         sheets=self.sheets)
            save_csvrows(self.output().path, [attr.astuple(r) for r in rows])

            percent = round((i + 1) * 100 / len(self.input()))
            self.set_progress_percentage(percent)
Example #5
def parse_excel_rect_area_to_csv(xl_fpath,
                                 csv_fpath,
                                 wrapper,
                                 sheets=None,
                                 skiptopnum=None,
                                 usecols=None,
                                 transform_callback=None):
    """ Save records parsed from excel file to csv """

    # get the list of sheets in the workbook
    xl_file = pd.ExcelFile(xl_fpath)
    xl_sheets = xl_file.sheet_names

    # by default we parse all the sheets
    _sheets = sheets
    if not sheets:
        _sheets = list(range(len(xl_sheets)))

    # rows to skip from the top of each sheet;
    # by default we always skip one row
    _skiptopnums = [1] * len(_sheets)

    # if skiptopnum is given, it applies only to the first sheet
    if skiptopnum:
        _skiptopnums[0] = skiptopnum

    count = 0

    for i, sh in enumerate(_sheets):

        if sh < len(xl_sheets):
            df = pd.read_excel(xl_fpath,
                               sheet_name=xl_sheets[sh],
                               skiprows=_skiptopnums[i],
                               usecols=usecols,
                               index_col=None,
                               dtype=str,
                               header=None)
            # convert Excel's empty cells to empty string
            data = df.replace(np.nan, '', regex=True)
            data.dropna(inplace=True)
            rows = [wrapper(*x) for x in data.values]

            if len(rows) > 0:

                if transform_callback:
                    transform_callback(rows)

                save_csvrows(csv_fpath, [attr.astuple(r) for r in rows])
            count += len(rows)

    return count
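A hypothetical call to parse_excel_rect_area_to_csv; the file names, the BranchRow wrapper and the column range below are made up for illustration and assume an attrs-based row class like the ones used elsewhere on this page.

import attr


@attr.s
class BranchRow:
    # illustrative wrapper: one attribute per parsed column
    code = attr.ib(default='')
    name = attr.ib(default='')
    region = attr.ib(default='')


saved = parse_excel_rect_area_to_csv('branches.xlsx', 'branches.csv',
                                     BranchRow,
                                     sheets=[0],     # first sheet only
                                     skiptopnum=3,   # skip a 3-row header
                                     usecols='A:C')
print(f'{saved} rows saved')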
Example #6
    def run(self):

        rep_url = build_url_for_report_page(self.rep_name)
        versions = self.versions
        if not versions:
            versions = load_versions(rep_url)
        for vs in versions:
            url = build_url_for_data_page(self.rep_name,
                                          DGOV_API_KEY,
                                          version=vs)
            data = load_data(url, Row)
            save_csvrows(self.output().path, data)
Example #7
    def run(self):
        url = build_url_for_report_page(self.report_name)
        version = load_versions(url)[-1]

        parser = DatagovApiParsing(self.api_key, self.report_name, self.struct,
                                   self.chunk_size,
                                   self.output().path)
        d = self.date.strftime(FRMT)
        q = '{"size":%s,"query":{"bool":{"must":[{"match":{"date":"%s"}}]}}}' % (
            self.chunk_size, d)

        data = parser.parse_query_report(version, q)
        save_csvrows(self.output().path, data)
Example #8
def parse_chunk(url, struct, output_fpath, updates_for=None,
                timeout=None, retries=None, backoff_factor=None):
    # load a single chunk and append its rows to the output CSV;
    # any loading error simply propagates to the caller
    data = load(url, struct, updates_for=updates_for,
                timeout=timeout, retries=retries,
                backoff_factor=backoff_factor)
    save_csvrows(output_fpath, data)
    return len(data)
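A hypothetical invocation of parse_chunk; the URL, the ContractRow struct and the retry tuning values are placeholders, not values taken from the project.

rows_saved = parse_chunk('https://example.org/api/contracts?from=0&size=500',
                         ContractRow, '/tmp/contracts.csv',
                         timeout=10, retries=3, backoff_factor=0.5)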
Example #9
    def run(self):
        error_timeout = self.timeout * 3
        headers = {'Authorization': self.token}

        url = f'{self.url}?limit={self.limit}'
        host = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(url))

        # we store the URI of each parsed block;
        # on rerun we resume from the last one
        if os.path.exists(self.parsed_fpath):
            uri = read_lines(self.parsed_fpath).pop()
            url = f'{host}{uri}'

        total = 0
        parsed_count = get_file_lines_count(self.output().path) or 0

        while url:
            try:
                r = get(url, headers=headers, timeout=self.timeout)
            except Exception:
                sleep(error_timeout)
            else:
                response = Box(json.loads(r))
                if response.next_page:
                    url = f'{self.url}?{response.next_page}'
                    append_file(self.parsed_fpath, response.next_page)
                else:
                    url = None

                total = response.total
                raw_items = list(response['items'])
                # data = dict_to_csvrow(raw_items, self.struct)
                data = [dict_to_csvrow(d, self.struct) for d in raw_items]
                save_csvrows(self.output().path, data, quoter="\"")
                parsed_count += self.limit
                sleep(self.timeout)

            self.set_status_message(f'Total: {total}. Parsed: {parsed_count}')
            if total:
                self.set_progress_percentage(round((parsed_count * 100) / total))

        stat = dict(total=total, parsed=parsed_count)
        append_file(self.success_fpath, str(stat))
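This task checkpoints its progress with append_file and read_lines, and the bigger parsers below do the same with their parsed-chunks files. Their code is not shown; a minimal sketch under the assumption that they are plain line-oriented file helpers:

def append_file(fpath, line):
    """Append a single line to a text file (assumed helper behaviour)."""
    with open(fpath, 'a', encoding='utf-8') as f:
        f.write(f'{line}\n')


def read_lines(fpath):
    """Return a list of stripped lines from a text file (assumed helper behaviour)."""
    with open(fpath, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]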
Example #10
def parse_report(rep, struct, apikey, output_fpath, parsed_fpath,
                  updates_date=None, version=None, query=None,
                  callback=None):
    # retrieve the total row count
    total = load_total(build_url_for_detail_page(rep, apikey, version, query))

    # get already parsed chunks from the parsed-chunks file
    parsed_chunks = []
    if os.path.exists(parsed_fpath):
        parsed_chunks = read_lines(parsed_fpath)

    is_retrying = False
    parsed_chunks_count = 0
    if parsed_chunks:
        parsed_chunks_count = len(parsed_chunks)
        is_retrying = True

    # build chunks considering already parsed chunks
    chunks, total_chunks, parsed_count = prepare_chunks(total, parsed_chunks)

    errors = 0

    with futures.ThreadPoolExecutor(max_workers=3) as ex:
        to_do_map = {}

        for chunk in chunks:
            _chunk = Chunk(*(chunk.split(':')))
            query = '{' + QUERY_TMPL.format(_chunk.start, _chunk.size) + '}'

            url = build_url_for_data_page(rep, apikey, version=version, query=query)

            future = ex.submit(load2, url, struct, updates_date)

            to_do_map[future] = chunk

        done_iter = futures.as_completed(to_do_map)

        for future in done_iter:
            try:
                data = future.result()
                start, size, _ = to_do_map[future].split(':')
            except (HTTPError, ConnectionError, Timeout, RetryError, ReadTimeout) as exc:
                print(exc)
                errors += 1
                sleep(TIMEOUT * 2)
            else:
                _chunk = '{}:{}:{}'.format(start, size, len(data))
                print(_chunk)
                parsed_count += len(data)
                parsed_chunks_count += 1
                save_csvrows(output_fpath, data)
                append_file(parsed_fpath, _chunk)
                sleep(TIMEOUT)

            if callback:
                s, p = prepare_callback_info(total, total_chunks,
                                             parsed_count, errors, parsed_chunks_count,
                                             updates_date, is_retrying)
                callback(s, p)

    if total_chunks != parsed_chunks_count:
        raise ExternalSourceError("Could not parse all the data. Try again.")

    stata = dict(total=total, parsed_count=parsed_count)
    append_file(success_fpath(output_fpath), json.dumps(stata))
    return parsed_count
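Both this function and parse_dgovbig below rebuild a Chunk from the start:size:count strings stored in the parsed-chunks file and then read .start, .size and .count from it. The definition is not shown; a namedtuple is the simplest shape consistent with that usage (an assumption; note the fields stay strings after split(':')).

from collections import namedtuple

# assumed shape, matching Chunk(*'100:500:500'.split(':')) and the
# .start / .size / .count attribute access in the examples
Chunk = namedtuple('Chunk', ['start', 'size', 'count'])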
Example #11
    def run(self):
        for target in self.input():
            rows = parse(target.path, Row,
                         skiprows=self.skiptop, usecols=self.usecolumns)
            save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
Example #12
    def run(self):

        rows = nb_rates_as_csvrows(self.url)
        save_csvrows(self.output().path, rows)
Example #13
    def run(self):
        d = parse_json_from_js(self.url, self.pattern)
        # wrap each record in Row and convert it to a tuple
        rows = [attr.astuple(Row(**_d)) for _d in d]
        save_csvrows(self.output().path, rows)
Example #14
    def run(self):
        rows = parse(self.input().path, Row, skiprows=self.skiptop)
        save_csvrows(self.output().path, [attr.astuple(r) for r in rows])
Example #15
    def parse_report(self, version, dates_range=None, progress_callback=None):

        total_rows = self.get_total_rows_for_version(version)
        all_chunks = get_chunks_start_position(total_rows, self.chunk_size)

        parsed_chunks = self.get_parsed_chunks()

        is_retrying = False
        if parsed_chunks:
            is_retrying = True

        chunks = deque(prepare_chunks2(all_chunks, parsed_chunks))

        errors = 0
        parsed_chunks_count = len(parsed_chunks)
        parsed_rows_count = parsed_chunks_count * self.chunk_size

        while chunks:
            chunk = chunks.popleft()
            query = build_query(chunk,
                                self.chunk_size,
                                dates_range=dates_range)
            url = build_url_for_data_page(self.report_name,
                                          self.apikey,
                                          version=version,
                                          query=query)
            print(url)
            try:
                data = load3(url, self.struct)

            except (HTTPError, ConnectionError, Timeout, RetryError,
                    ReadTimeout) as exc:
                print(exc)
                chunks.append(chunk)
                sleep(TIMEOUT * 2)
                errors += 1
            else:

                if (not data) and dates_range:
                    break
                parsed_rows_count += len(data)
                parsed_chunks_count += 1
                save_csvrows(self.output_fpath, data)
                if self.parsed_fpath:
                    append_file(self.parsed_fpath, str(chunk))
                sleep(self.timeout)

            if progress_callback:
                s, p = self._progress_status_info(
                    version,
                    len(all_chunks),
                    errors,
                    parsed_rows_count,
                    parsed_chunks_count,
                    is_retrying=is_retrying,
                )
                progress_callback(s, p)

        if not total_rows:
            raise ExternalSourceError(
                f'Report {self.report_name}:{version} has no data.')

        return total_rows, parsed_rows_count
Example #16
def parse_dgovbig(rep,
                  struct,
                  apikey,
                  output_fpath,
                  parsed_fpath,
                  updates_date=None,
                  version=None,
                  query=None,
                  callback=None):

    # retrieve the total row count
    total = load_total(build_url_for_detail_page(rep, apikey, version, query))

    # get already parsed chunks from the parsed-chunks file
    parsed_chunks = []
    if os.path.exists(parsed_fpath):
        parsed_chunks = read_lines(parsed_fpath)

    is_retrying = False
    parsed_chunks_count = 0
    if parsed_chunks:
        parsed_chunks_count = len(parsed_chunks)
        is_retrying = True

    # build chunks considering already parsed chunks
    chunks, total_chunks, parsed_count = prepare_chunks(total, parsed_chunks)

    errors = 0

    # a deque of chunks is convenient: on failure we can push
    # the failed chunk back and retry it later
    chunks = deque(chunks)
    while chunks:
        _ch = chunks.popleft()
        chunk = Chunk(*(_ch.split(':')))
        query = '{' + QUERY_TMPL.format(chunk.start, chunk.size) + '}'

        url = build_url_for_data_page(rep,
                                      apikey,
                                      version=version,
                                      query=query)
        print(url)
        try:
            data = load2(url, struct, updates_date=updates_date)
        except (HTTPError, ConnectionError, Timeout, RetryError,
                ReadTimeout) as exc:
            chunks.append(_ch)
            sleep(TIMEOUT * 2)
            errors += 1
        else:
            _chunk = Chunk(chunk.start, chunk.size, len(data))
            parsed_count += _chunk.count
            parsed_chunks_count += 1
            save_csvrows(output_fpath, data)
            append_file(parsed_fpath, ':'.join((str(ch) for ch in _chunk)))
            sleep(TIMEOUT)
        if callback:
            s, p = prepare_callback_info(total, total_chunks, parsed_count,
                                         errors, parsed_chunks_count,
                                         updates_date, is_retrying)
            callback(s, p)

    # if we have not parsed all the chunks,
    # we should retry again after a while
    if total_chunks != parsed_chunks_count:
        raise ExternalSourceError("Could not parse all chunks. Try again.")

    stata = dict(total=total, parsed_count=parsed_count)
    append_file(success_fpath(output_fpath), json.dumps(stata))
    return parsed_count
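A hypothetical top-level call to parse_dgovbig; the report name, row struct, API key and file paths are placeholders chosen only to show the argument order.

parsed = parse_dgovbig('some_report', SomeRow, 'my-api-key',
                       output_fpath='/data/some_report.csv',
                       parsed_fpath='/data/some_report.prs',
                       version='v2',
                       callback=lambda status, percent: print(status, percent))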