Code Example #1
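A pipeline spec registers under its id and gets a chained cache hash: each step's MD5 mixes the previous hash, the executor file's bytes, and the step's canonical JSON (sort_keys=True, ensure_ascii=True), so changing any step invalidates every step after it.
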
    def calculate_hash(self, spec):
        cache_hash = None
        if spec.pipeline_id in self.all_pipeline_ids:
            message = 'Duplicate key {0} in {1}' \
                .format(spec.pipeline_id, spec.abspath)
            spec.errors.append(SpecError('Duplicate Pipeline Id', message))

        else:
            cache_hash = resolve_dependencies(spec, self.all_pipeline_ids)
            if len(spec.errors) > 0:
                return cache_hash

            for step in spec.pipeline_details['pipeline']:
                m = hashlib.md5()
                m.update(cache_hash.encode('ascii'))
                with open(step['executor'], 'rb') as f:
                    m.update(f.read())
                m.update(
                    json.dumps(step, ensure_ascii=True,
                               sort_keys=True).encode('ascii'))
                cache_hash = m.hexdigest()
                step['_cache_hash'] = cache_hash

            self.all_pipeline_ids[spec.pipeline_id] = spec

        spec.cache_hash = cache_hash
Code Example #2
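Scraping an office-tender page: a map from output fields to label-element IDs drives a pyquery-style lookup, the page's publication id is checked against the crawled URL to detect blocked responses, and the documents list is serialized with json.dumps.
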
def get_office_data(row, page, documents):
    input_fields_text_map = {
        "publication_id": "SERIAL_NUMBER",
        "publishnum": "PublishNum",
        "description": "PublicationName",
        "publisher": "Publisher",
        "claim_date": "ClaimDate",
        "last_update_date": "UpdateDate",
        "subjects": "PublicationSUBJECT",
        "publish_date": "PublishDate",
        "status": "PublicationSTATUS"
    }
    source_data = {
        k: page("#ctl00_PlaceHolderMain_lbl_{}".format(v)).text()
        for k, v in input_fields_text_map.items()
    }
    publication_id = publication_id_from_url(row["url"])
    if str(publication_id) != str(source_data["publication_id"]):
        raise Exception("invalid or blocked response")
    return {
        "publisher_id": int(row["id"]),
        "publication_id": publication_id,
        "tender_type": ("office"
                        if source_data['publisher'] != 'משרד האוצר - מינהל הרכש הממשלתי'
                        else 'central'),
        "page_url": row["url"],
        "description": source_data["description"],
        # "supplier_id": None,
        # "supplier": None,
        # "contact": None,
        "publisher": source_data["publisher"],
        # "contact_email": None,
        "claim_date": parse_datetime(source_data["claim_date"]),
        "last_update_date": parse_date(source_data["last_update_date"]),
        # "reason": None,
        # "source_currency": None,
        # "regulation": None,
        # "volume": None,
        "subjects": source_data["subjects"],
        "start_date": parse_date(source_data["publish_date"]),
        # "end_date": None,
        "decision": source_data["status"],
        # "page_title": None,
        "tender_id": source_data["publishnum"] or 'none',
        "documents": json.dumps(documents, sort_keys=True, ensure_ascii=False),
    }
Code Example #3
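A dict-style SQLite store: __setitem__ serializes the value to JSON and emulates an upsert with a DELETE followed by an INSERT.
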
    def __setitem__(self, key, value):
        conn = sqlite3.connect(self.filename)
        value = json.dumps(value)
        cursor = conn.cursor()
        cursor.execute('DELETE FROM d WHERE _key=?', (key, ))
        cursor.execute('INSERT INTO d VALUES (?,?)', (key, value))
        conn.commit()
        conn.close()
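
For symmetry, a reader has to reverse the encoding. A minimal sketch, assuming the same two-column table d (the value column's name is not shown above, so the row is fetched whole):

    def __getitem__(self, key):
        # Sketch: fetch the row for the key and decode the stored JSON text
        conn = sqlite3.connect(self.filename)
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM d WHERE _key=?', (key, ))
        row = cursor.fetchone()
        conn.close()
        if row is None:
            raise KeyError(key)
        return json.loads(row[1])  # column 0 is _key, column 1 the JSON value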
Code Example #4
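The same upsert idea on a persistent connection: a get() that raises KeyError decides between UPDATE and INSERT.
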
    def set(self, key, value):
        value = json.dumps(value)
        try:
            self.get(key)
            self.cursor.execute('''UPDATE d SET value=? WHERE key=?''',
                                (value, key))
        except KeyError:
            self.cursor.execute('''INSERT INTO d VALUES (?, ?)''',
                                (key, value))
        self.db.commit()
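
set() relies on a get() that raises KeyError for missing keys; a possible counterpart, inferred from the key/value column names in the UPDATE above:

    def get(self, key):
        # Assumed counterpart: KeyError signals a missing key to set()
        self.cursor.execute('''SELECT value FROM d WHERE key=?''', (key, ))
        row = self.cursor.fetchone()
        if row is None:
            raise KeyError(key)
        return json.loads(row[0])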
Code Example #5
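The exemptions variant of Code Example #2: a larger field-to-label map, the same publication-id validity check, and typed parsing of the date fields.
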
    def get_exemptions_data(self, row, page, documents):
        input_fields_text_map = {
            "publication_id": "SERIAL_NUMBER",
            "description": "PublicationName",
            "supplier_id": "SupplierNum",
            "supplier": "SupplierName",
            "contact": "ContactPersonName",
            "publisher": "PUBLISHER",
            "contact_email": "ContactPersonEmail",
            "claim_date": "ClaimDate",
            "last_update_date": "UpdateDate",
            "reason": "PtorReason",
            "source_currency": "Currency",
            "regulation": "Regulation",
            "volume": "TotalAmount",
            "subjects": "PublicationSUBJECT",
            "start_date": "StartDate",
            "end_date": "EndDate",
            "decision": "Decision",
            "page_title": "PublicationType",
        }
        source_data = {
            k: page("#ctl00_PlaceHolderMain_lbl_{}".format(v)).text()
            for k, v in input_fields_text_map.items()
        }
        publication_id = publication_id_from_url(row["url"])
        if str(publication_id) != str(source_data["publication_id"]):
            raise Exception("invalid or blocked response")
        return {
            "publisher_id": int(row["pid"]),
            "publication_id": publication_id,
            "tender_type": "exemptions",
            "page_url": row["url"],
            "description": source_data["description"],
            "supplier_id": source_data["supplier_id"],
            "supplier": source_data["supplier"],
            "contact": source_data["contact"],
            "publisher": source_data["publisher"],
            "contact_email": source_data["contact_email"],
            "claim_date": parse_datetime(source_data["claim_date"]),
            "last_update_date": parse_date(source_data["last_update_date"]),
            "reason": source_data["reason"],
            "source_currency": source_data["source_currency"],
            "regulation": source_data["regulation"],
            "volume": source_data["volume"],
            "subjects": source_data["subjects"],
            "start_date": parse_date(source_data["start_date"]),
            "end_date": parse_date(source_data["end_date"]),
            "decision": source_data["decision"],
            "page_title": source_data["page_title"],
            "tender_id": "none",
            "documents": json.dumps(documents, sort_keys=True,
                                    ensure_ascii=False)
        }
Code Example #6
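Coercing arbitrary cell values to strings: str, None, numeric scalars, and dates are handled directly, lists and dicts fall back to json.dumps, and anything else is an internal error.
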
def _tostr(value):
    if isinstance(value, str):
        return value
    elif value is None:
        return ''
    elif isinstance(value, (int, float, bool, Decimal)):
        return str(value)
    elif isinstance(value, date):
        return value.isoformat()
    elif isinstance(value, (list, dict)):
        return json.dumps(value)

    assert False, "Internal error - don't know how to handle %r of type %r" % (value, type(value))
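
A few illustrative calls (hypothetical values):

_tostr(None)               # ''
_tostr(Decimal('1.50'))    # '1.50'
_tostr(date(2020, 1, 31))  # '2020-01-31'
_tostr({'a': [1, 2]})      # '{"a": [1, 2]}'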
Code Example #7
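A server-sent-events endpoint that streams a subprocess's stderr lines to the client and reports any failure as a JSON-encoded error event.
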
async def events(request: web.Request):
    loop = request.app.loop

    uuid = request.match_info['id']
    async with sse_response(request, headers=CORS_HEADERS) as resp:
        try:
            async with ProcessRunner(loop, uuid) as process:
                print('starting!', uuid)
                async for line in LineReader(process.stderr):
                    if line is None:
                        continue
                    # aiohttp-sse >= 2.0 makes send() a coroutine, so it must be awaited
                    await resp.send(line)
                print('done!', uuid)
                await resp.send('close')
        except Exception as e:
            msg = 'General Error %s' % e
            await resp.send(json.dumps({'e': 'err', 'msg': msg, 'uuid': 'general'}))
    return resp
Code Example #8
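Scraping a central-tender page with pyquery: document links are collected from two summary-link panels, fields are read from labeled value divs, and an all-empty result is treated as a blocked response.
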
    def get_central_data(self, row, page, documents):
        # michraz_number = page("#ctl00_PlaceHolderMain_MichraznumberPanel div.value").text().strip()
        # note: the incoming documents argument is discarded and rebuilt from the page
        documents = []
        for elt in page("#ctl00_PlaceHolderMain_SummaryLinksPanel_SummaryLinkFieldControl1__ControlWrapper_SummaryLinkFieldControl a"):
            documents.append({"description": ' '.join(elt.text.strip().split()),
                              "link": elt.attrib["href"],
                              "update_time": None})
        for elt in page("#ctl00_PlaceHolderMain_SummaryLinks2Panel"):
            documents.append({"description": ' '.join(pq(elt).text().strip().split()),
                              "link": pq(elt).find("a")[0].attrib["href"],
                              "update_time": None})
        publication_id = page("#ctl00_PlaceHolderMain_ManofSerialNumberPanel div.value").text().strip()
        outrow = {
            "publisher_id": None,
            "publication_id": int(publication_id) if publication_id else 0,
            "tender_type": "central",
            "page_url": row["url"],
            "description": page("#ctl00_PlaceHolderMain_GovXContentSectionPanel_Richhtmlfield1__ControlWrapper_RichHtmlField").text().strip(),
            "supplier_id": None,
            "supplier": page("#ctl00_PlaceHolderMain_GovXParagraph1Panel_ctl00__ControlWrapper_RichHtmlField div").text().strip(),
            "contact": page("#ctl00_PlaceHolderMain_WorkerPanel_WorkerPanel1 div.worker").text().strip(),
            "publisher": None,
            "contact_email": None,
            "claim_date": None,
            "last_update_date": None,
            "reason": None,
            "source_currency": None,
            "regulation": page("#ctl00_PlaceHolderMain_MIchrazTypePanel div.value").text().strip(),
            "volume": None,
            "subjects": page("#ctl00_PlaceHolderMain_MMDCategoryPanel div.value").text().strip(),
            "start_date": None,
            "end_date": parse_date(page("#ctl00_PlaceHolderMain_TokefEndDatePanel div.Datevalue").text().strip()),
            "decision": page("#ctl00_PlaceHolderMain_MichrazStatusPanel div.value").text().strip(),
            "page_title": page("h1.MainTitle").text().strip(),
            "tender_id": tender_id_from_url(row["url"]),
            "documents": json.dumps(documents, sort_keys=True, ensure_ascii=False),
        }
        if outrow["description"] == "" and outrow["supplier"] == "" and outrow["subjects"] == "":
            raise Exception("invalid or blocked response")
        return outrow
Code Example #9
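A datapackage-pipelines processor that logs the first ten rows of the selected resource (evaluating lazy JSON lines for readable output) while passing every row through unchanged.
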
import logging

# LazyJsonLine is assumed to live beside json in the extended_json module
from datapackage_pipelines.utilities.extended_json import json, LazyJsonLine

from datapackage_pipelines.wrapper import spew, ingest

parameters, datapackage, res_iter = ingest()
res_name = parameters.get('resource', datapackage['resources'][0]['name'])


def show_sample(res):
    logging.info('SAMPLE OF LINES from %s', res.spec['name'])
    for i, row in enumerate(res):
        if i < 10:
            if isinstance(row, LazyJsonLine):
                logging.info('#%s: %s', i, row._evaluate())
            else:
                logging.info('#%s: %r', i, row)
        yield row


def process_resources(res_iter_):
    for res in res_iter_:
        logging.info('? from %s', res.spec['name'])
        if res.spec['name'] == res_name:
            yield show_sample(res)
        else:
            yield res


logging.info(json.dumps(datapackage, indent=2))

spew(datapackage, process_resources(res_iter))
Code Example #10
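Persisting a pipeline's status as JSON under its id in Redis, guarded by an initialization check.
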
    def set_status(self, pipeline_id, status):
        if self.is_init():
            self.redis.set(pipeline_id, json.dumps(status, ensure_ascii=True))
Code Example #11
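The minimal case: a one-line wrapper around json.dumps.
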
def jsonize(obj):
    return json.dumps(obj)
Code Example #12
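Year-over-year matching of budget items: for each row it tries curated mappings, then an exact code match with a similar title, then items moved under a new parent, and finally splits into children; matched rows get a merged history (with double-booking warnings), unmatched rows are collected and returned.
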
def calc_equivs(cur_year, rows, connected_items, new_connected_items,
                to_delete):

    # rows = list(rows)
    # logging.info('cur_year: %r, num rows = %d, prev_year=%d', cur_year, len(rows), len(list(connected_items.iterator())))
    # logging.info('connected_items: %r', connected_items)
    # logging.info('new_connected_items: %r', new_connected_items)

    mapped_levels = {}
    unmatched = []
    for row in rows:
        row = normalize(row)
        equivs = []
        parent = row['parent']
        children = row['children']

        ids = [{'code': row['code'], 'title': row['title']}]
        while len(ids) > 0:
            logging.debug('%d/%r: ids: %r', cur_year, row['code'], ids)
            id = ids.pop(0)

            test_value = sum(
                abs(row[f])
                for f in ('net_allocated', 'gross_allocated', 'net_revised',
                          'commitment_allocated', 'net_used')
                if row.get(f) is not None)
            non_repeating = row.get('non_repeating', [])
            non_repeating = '1' in non_repeating and len(non_repeating) == 1
            if (test_value == 0
                    and not row['code'].endswith('99')) or non_repeating:
                unmatched.append(row)
                row = None
                break

            # Find curated record for id
            curated_items = curated.get((cur_year, id['code']))
            if curated_items is not None:
                if len(curated_items) == 0:
                    unmatched.append(row)
                    row = None
                    break

                for year, code in curated_items:
                    assert year == cur_year - 1
                    value = get(connected_items, code)
                    if value is not None:
                        equivs.append(value)
                    else:
                        logging.warning(
                            '%d/%s: Failed to find curated item %s/%s',
                            cur_year, id['code'], year, code)
                if len(equivs) > 0:
                    logging.debug('FOUND CURATED ITEM for %r', id)
                    continue
                else:
                    logging.warning('FOUND 0 CURATED ITEMS for %r', id)

            # Find connected item with same code and title
            connected_item = get(connected_items, id['code'])
            if connected_item is not None:
                if similar(id['title'], connected_item['title']):
                    logging.debug('FOUND EXACT ITEM for %r', id)
                    equivs.append(connected_item)
                    continue

            # Try to find similar named items which moved to a new parent
            if parent is not None:
                connected_item = get(new_connected_items, parent)
                if connected_item is not None:
                    parent = None
                    assert connected_item['year'] == cur_year
                    prev_year_rows = connected_item['history'].get(
                        cur_year - 1, [])
                    candidates = []
                    for prev_year_row in prev_year_rows:
                        prev_year_children = prev_year_row['children']
                        if prev_year_children is None:
                            continue
                        for prev_year_child in prev_year_children:
                            if similar(prev_year_child['title'], id['title']):
                                candidates.append(prev_year_row)
                    if len(candidates) == 1:
                        connected_item = get(connected_items,
                                             candidates[0]['code'])
                        if connected_item is not None:
                            logging.debug('FOUND MOVED ITEM for %r', id)
                            equivs.append(connected_item)
                            continue

            # Split into children
            if children is not None and len(children) > 0:
                logging.debug('SPLITTING TO CHILDREN for %r', id)
                ids.extend({
                    'code': x['code'],
                    'title': x['title']
                } for x in children)
                children = None
                continue

            # Couldn't find match - no point in continuing
            logging.debug('FAILED TO FIND MATCH for %s/%s', cur_year, id)
            unmatched.append(row)
            row = None
            break

        # Found match
        if row is not None:
            assert len(equivs) > 0
            new_history = {}
            # logging.info(', '.join(x['code'] for x in equivs))
            codes = set()
            for equiv in equivs:
                if equiv['code'] in codes:
                    continue
                codes.add(equiv['code'])
                s = mapped_levels.setdefault(equiv['code'], set())
                if len(row['code']) in s:
                    logging.warning('DOUBLE BOOKING for %s/%s from %s/%s',
                                    equiv['year'], equiv['code'], row['year'],
                                    row['code'])
                    for nci in iterate_values(new_connected_items):
                        for hist_item in nci.get('history',
                                                 {}).get(equiv['year'], []):
                            if hist_item['code'] == equiv['code']:
                                logging.warning('FOUND')
                                logging.warning('%s', json.dumps(nci,
                                                                 indent=2))
                else:
                    s.add(len(row['code']))
                to_delete.add(equiv['code'])
                for year, hist_item in equiv['history'].items():
                    update_equiv(new_history.setdefault(year, {}), hist_item)
                update_equiv(new_history.setdefault(equiv['year'], {}), equiv)
            row['history'] = new_history
            put(new_connected_items, row['code'], row)

    logging.error('UNMATCHED %d: %d', cur_year, len(unmatched))

    return unmatched
Code Example #13
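Writing a JSON-encoded value into a bytes-keyed store (a LevelDB-style put); json.dumps defaults to ensure_ascii=True, so the ASCII encode is safe.
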
def put(db, key, value):
    assert value is not None
    enc = json.dumps(value)
    db.put(key.encode('utf8'), enc.encode('ascii'))
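
Code Example #12 above treats get(connected_items, code) returning None as "not found"; a matching reader for this store could look like the following sketch (assuming a plyvel-style bytes-in/bytes-out API):

def get(db, key):
    # Reverse both encodings; None signals a missing key
    enc = db.get(key.encode('utf8'))
    if enc is None:
        return None
    return json.loads(enc.decode('ascii'))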
Code Example #14
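A later revision of Code Example #8 as a standalone function: when the page carries a publication id it follows through to the linked office tender and reuses get_office_data, then merges the remaining document links and central-tender fields into the row.
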
def get_central_data(row, page, documents):
    # michraz_number = page("#ctl00_PlaceHolderMain_MichraznumberPanel div.value").text().strip()
    outrow = copy.deepcopy(row)
    publication_id = page("#ctl00_PlaceHolderMain_ManofSerialNumberPanel div.value").text().strip()
    if publication_id:
        # Follow through to the linked office tender and reuse its extractor
        ot_url = BASE_URL + '/officestenders/Pages/officetender.aspx?pID={}'.format(publication_id)
        ot_page = pq(_get_url_response_text(ot_url))
        documents = extract_documents(ot_page)
        outrow['url'] = ot_url
        outrow['id'] = -1
        outrow = get_office_data(outrow, ot_page, documents)
    else:
        logging.info('no publication id, continuing')
    dd = []
    for elt in page("#ctl00_PlaceHolderMain_SummaryLinksPanel_SummaryLinkFieldControl1__ControlWrapper_SummaryLinkFieldControl a"):
        link = elt.attrib["href"]
        if 'officestenders/Pages/officetender.aspx?pID=' not in link:
            dd.append((' '.join(elt.text.strip().split()), link))
    for elt in page("#ctl00_PlaceHolderMain_SummaryLinks2Panel"):
        link = pq(elt).find("a")[0].attrib["href"]
        if 'officestenders/Pages/officetender.aspx?pID=' not in link:
            dd.append((' '.join(pq(elt).text().strip().split()), link))
    documents.extend(
        dict(description=d[0], link=d[1], update_time=None) for d in dd)
    description = ' '.join([
        page("#ctl00_PlaceHolderMain_GovXContentSectionPanel_Richhtmlfield1__ControlWrapper_RichHtmlField").text().strip(),
        page("#ctl00_PlaceHolderMain_GovXParagraph1Panel_ctl00__ControlWrapper_RichHtmlField div").text().strip(),
    ]).strip()
    outrow.update({
        "publication_id": int(publication_id) if publication_id else 0,
        "tender_type": "central",
        "page_url": row["url"],
        "description": description,
        "contact": page("#ctl00_PlaceHolderMain_WorkerPanel_WorkerPanel1 div.worker").text().strip(),
        "regulation": page("#ctl00_PlaceHolderMain_MIchrazTypePanel div.value").text().strip(),
        "subjects": page("#ctl00_PlaceHolderMain_MMDCategoryPanel div.value").text().strip(),
        "end_date": (parse_date(page("#ctl00_PlaceHolderMain_TokefEndDatePanel div.Datevalue").text().strip())
                     or parse_date(page("#ctl00_PlaceHolderMain_HoraatShaaEndDatePanel div.Datevalue").text().strip())),
        "start_date": (outrow.get('start_date')
                       or parse_date(page("#ctl00_PlaceHolderMain_HoraatShaaStartDatePanel div.Datevalue").text().strip())),
        "decision": page("#ctl00_PlaceHolderMain_MichrazStatusPanel div.value").text().strip(),
        "page_title": page("h1.MainTitle").text().strip(),
        "tender_id": tender_id_from_url(row["url"]),
        "documents": json.dumps(documents, sort_keys=True, ensure_ascii=False),
    })
    if not any(outrow.get(x) for x in ("description", "supplier", "subjects")):
        raise Exception("invalid or blocked response")
    return outrow
Code Example #15
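Filtering log output by process: with only_last set, only the 'last' uuid emits; otherwise every uuid except 'last' does.
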
    def _send(self, msg):
        msg['uuid'] = self.uuid
        # only_last is a module-level flag: log either just the 'last'
        # process or everything except it
        if only_last and self.uuid == 'last':
            logging.info(json.dumps(msg))
        elif not only_last and self.uuid != 'last':
            logging.info(json.dumps(msg))