def load_news(engine):
    log.info("Loading news into production DB...")
    _NewsSource = sl.get_table(engine, 'news')
    for data in sl.all(engine, _NewsSource):
        news = NewsItem.query.filter_by(
            source_url=data.get('source_url')).first()
        if news is None:
            news = NewsItem()
        news.key = data.get('key')
        news.source_url = data.get('source_url')
        news.title = data.get('title')
        news.text = data.get('text')
        news.date = date(data.get('date'))
        news.image_url = data.get('image_url')
        news.image_copyright = data.get('image_copyright')
        news.image_changed_at = date(data.get('image_changed_at'))
        if data.get('gremium'):
            gremium = Gremium.query.filter_by(
                key=data.get('gremium')).first()
            if gremium is None:
                log.error("Gremium %s not found!" % data.get('gremium'))
            else:
                news.gremium = gremium
        db.session.add(news)
    db.session.commit()

def combine_sheet(engine, resource, sheet_id, table, mapping):
    begin = time.time()
    base = {
        'resource_id': resource['resource_id'],
        'resource_hash': resource['extract_hash'],
        'sheet_id': sheet_id,
    }
    spending_table = sl.get_table(engine, 'spending')
    connection = engine.connect()
    trans = connection.begin()
    try:
        rows = 0
        sl.delete(connection, spending_table,
                  resource_id=resource['resource_id'],
                  sheet_id=sheet_id)
        for row in sl.all(connection, table):
            data = dict(base)
            for col, value in row.items():
                if col == 'id':
                    data['row_id'] = value
                    continue
                mapped = mapping.get(col)
                if mapped is not None:
                    data[mapped] = value
            sl.add_row(connection, spending_table, data)
            rows += 1
        trans.commit()
        log.info("Loaded %s rows in %s ms", rows,
                 int((time.time() - begin) * 1000))
        return rows > 0
    finally:
        connection.close()

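# A minimal driver sketch for combine_sheet, assuming the caller already
# holds a 'resource' row (with 'resource_id' and 'extract_hash') and a
# column mapping. The per-sheet source table name 'raw_<id>_sheet<n>' and
# the 'num_sheets' argument are illustrative assumptions, not part of the
# code above.
def combine_resource(engine, resource, mapping, num_sheets):
    loaded = False
    for sheet_id in range(num_sheets):
        table = sl.get_table(engine, 'raw_%s_sheet%s' % (
            resource['resource_id'], sheet_id))
        loaded = combine_sheet(engine, resource, sheet_id,
                               table, mapping) or loaded
    return loaded
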
def update_network_entities(engine, file_name):
    log.info("Updating network entities reference sheet: %s", file_name)
    network_entities = set()
    table = sl.get_table(engine, 'network_entity')
    if os.path.exists(file_name):
        # Read the existing sheet back into the database first.
        fh = open(file_name, 'rb')
        reader = csv.DictReader(fh)
        for d in reader:
            e = dict([(k, v.decode('utf-8')) for (k, v) in d.items()])
            network_entities.add((e['representativeEtlId'],
                                  e['etlFingerPrint']))
            sl.upsert(engine, table, e,
                      ['representativeEtlId', 'etlFingerPrint'])
        fh.close()
    # Mark every representative that occurs in the sheet as extracted.
    reps = set([ne[0] for ne in network_entities])
    rep_table = sl.get_table(engine, 'representative')
    for rep in reps:
        sl.update(engine, rep_table, {'etlId': rep},
                  {'network_extracted': True})
    for row in sl.all(engine, table):
        network_entities.add((row['representativeEtlId'],
                              row['etlFingerPrint']))
    # Write the merged set back out, emitting the header row once.
    fh = open(file_name, 'wb')
    writer = None
    for ic, fp in network_entities:
        row = {
            'representativeEtlId': ic,
            'etlFingerPrint': fp
        }
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writerow(dict(zip(row.keys(), row.keys())))
        r = [(k, unicode(v).encode('utf-8')) for k, v in row.items()]
        writer.writerow(dict(r))
    fh.close()

def supplier_names_from_table(engine, resource_id, table_id):
    global tables_count, rows_count
    tables_count += 1
    print "# checking table %d" % tables_count
    table_suffix = "%s_table%s" % (resource_id, table_id)
    table = sl.get_table(engine, "spending_%s" % table_suffix)
    supplier_table = sl.get_table(engine, "suppliers")
    for row in sl.all(engine, table):
        rows_count += 1
        if "SupplierName" not in row:
            # One of the junk tables that contain no real data, usually
            # excel noise.
            continue
        supplier = row["SupplierName"]
        if supplier is None or supplier == "":
            continue
        if supplier in suppliers_visited:
            continue
        suppliers_visited[supplier] = True
        if sl.find_one(engine, supplier_table, original=supplier) is not None:
            continue
        yield supplier

def test_format_all():
    engine, table = connect()
    for row in sl.all(engine, table):
        format_ = partial(format, engine, row['resource_id'],
                          row['table_id'], False)
        format_.description = \
            'format: %(resource_id)s/%(table_id)s' % row
        yield format_

def test_condense_all():
    engine = db_connect()
    table = sl.get_table(engine, 'extracted')
    for row in sl.all(engine, table):
        for table_id in xrange(0, row['max_table_id'] + 1):
            condense_ = partial(condense, engine, row['resource_id'],
                                table_id, False)
            condense_.description = describe(row['resource_id'], table_id)
            yield condense_

def process(engine, table, key_column, color_column,
            subkey_column, subcolor_column):
    children = defaultdict(set)
    key_colors = {}
    for row in sl.all(engine, table):
        key = row.get(key_column)
        key_colors[key] = row.get(color_column)
        children[key].add(row.get(subkey_column))
    sub_colors = dict()
    for key in children.keys():
        gen = _color_range(key_colors.get(key), len(children[key]))
        for sub, col in zip(children[key], gen):
            sub_colors[(key, sub)] = col
    for row in sl.all(engine, table):
        ks = (row.get(key_column), row.get(subkey_column))
        row[subcolor_column] = sub_colors[ks]
        #pprint(row)
        sl.upsert(engine, table, row, ['id'])

def process_rows(handlefunc, engine=None):
    if engine is None:
        engine = make_engine()
    table = sl.get_table(engine, 'fts')
    for row in sl.all(engine, table):
        out = handlefunc(row)
        sl.upsert(engine, table, out, ['id'])
    return table

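# Usage sketch for process_rows: the handler receives each row of the
# 'fts' table as a dict and returns the dict to upsert back. The 'title'
# column below is an illustrative assumption about the schema.
def _tidy_title(row):
    # Collapse runs of whitespace in the title; other fields pass through.
    if row.get('title'):
        row['title'] = ' '.join(row['title'].split())
    return row

# process_rows(_tidy_title)
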
def condense(engine, resource_id, table_id, force):
    table_suffix = '%s_table%s' % (resource_id, table_id)
    if not engine.has_table('raw_%s' % table_suffix):
        return
    condensed_table = sl.get_table(engine, 'condensed')

    # Skip over tables we have already extracted
    if not force and sl.find_one(engine, condensed_table,
                                 resource_id=resource_id,
                                 table_id=table_id) is not None:
        return

    connection = engine.connect()
    trans = connection.begin()
    start = time.time()
    try:
        raw_table = sl.get_table(connection, 'raw_%s' % table_suffix)
        sl.drop_table(connection, 'spending_%s' % table_suffix)
        spending_table = sl.get_table(connection, 'spending_%s' % table_suffix)
        columns_table = sl.get_table(connection, 'column_sets')

        normalise_map = normalised_columns_map(raw_table)
        normalised_headers = ','.join(sorted(normalise_map.values()))
        mapping_row = sl.find_one(connection, columns_table,
                                  normalised=normalised_headers)
        if mapping_row is None or not mapping_row.get('valid'):
            # This table is unmapped, cannot be condensed
            return
        column_mapping = json.loads(mapping_row['column_map'])

        # Build the final mapping from input column to output column
        mapping = {}
        for k, n in normalise_map.iteritems():
            if n in column_mapping and column_mapping[n] is not None \
                    and len(column_mapping[n]) > 0:
                mapping[k] = column_mapping[n]

        for row in sl.all(connection, raw_table):
            spending_row = {}
            for key, value in row.items():
                if key not in mapping:
                    continue
                if not value or not len(value.strip()):
                    continue
                if mapping[key] in spending_row:
                    continue
                spending_row[mapping[key]] = value.strip()
            #print spending_row
            sl.add_row(connection, spending_table, spending_row)

        sl.upsert(connection, condensed_table,
                  {'resource_id': resource_id,
                   'table_id': table_id,
                   'condense_time': time.time() - start,
                   }, ['resource_id', 'table_id'])
        trans.commit()
    finally:
        connection.close()

def merge_votes(engine):
    _referenzen = defaultdict(set)
    referenz_tbl = sl.get_table(engine, "referenz")
    for ref in sl.all(engine, referenz_tbl):
        if ref["typ"] == "drs":
            # print ref
            _referenzen[ref["nummer"]].add(ref["ablauf_key"])
    mref = dict([(c, d) for c, d in _referenzen.items() if len(d) > 1])
    print len(_referenzen), len(mref)

def do_format(engine, resource_id, table_id):
    table_suffix = '%s_table%s' % (resource_id, table_id)
    table = sl.get_table(engine, 'spending_%s' % table_suffix)

    # Sample every date column once, up front, to detect its format.
    date_field_values = defaultdict(list)
    for row in sl.all(engine, table):
        for date_field in DATE_FIELDS:
            if date_field in row and row[date_field]:
                date_field_values[date_field].append(row[date_field])
    date_field_formats = {}
    for date_field, values in date_field_values.items():
        date_field_formats[date_field] = detect_date_format(values)

    for row in sl.all(engine, table):
        for numeric_field in NUMERIC_FIELDS:
            try:
                val = row.get(numeric_field)
                if val is None:
                    continue
                val = "".join([v for v in val if v in "-.0123456789"])
                row[numeric_field + 'AsFloat'] = float(val)
            except Exception as e:
                print e
        for date_field, format_ in date_field_formats.items():
            if format_ is None:
                continue
            try:
                if row[date_field] is None:
                    continue
                if format_ == 'excel':
                    # Deciphers excel dates that have been mangled into
                    # integers by formatting errors
                    parsed = datetime(*xldate_as_tuple(
                        float(row[date_field].strip()), 0))
                else:
                    parsed = datetime.strptime(row[date_field].strip(),
                                               format_)
                row[date_field + 'AsDate'] = parsed.strftime("%Y-%m-%d")
            except Exception as e:
                print e
        sl.upsert(engine, table, row, ['id'])

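# do_format writes parsed companion columns ('<field>AsFloat',
# '<field>AsDate') next to the raw values. A sketch of a driver that
# formats every condensed table, reusing the 'condensed' bookkeeping
# table maintained by condense() above:
def format_all(engine):
    for row in sl.all(engine, sl.get_table(engine, 'condensed')):
        do_format(engine, row['resource_id'], row['table_id'])
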
def create_entities(engine):
    log.info("De-normalizing global entities collection...")
    table = sl.get_table(engine, 'entity')
    for tbl in ['representative', 'person', 'financialDataTurnover',
                'organisation', 'network_entity']:
        for row in sl.all(engine, sl.get_table(engine, tbl)):
            entity = {'etlFingerPrint': row.get('etlFingerPrint')}
            entity['legalStatus'] = row.get('legalStatus', '')
            entity['countryCode'] = row.get('contactCountryCode', '')
            entity['etlTable'] = tbl
            sl.upsert(engine, table, entity, ['etlFingerPrint', 'etlTable'])

def load_persons(engine):
    log.info("Loading persons into production DB...")
    db.session.commit()
    db.session.execute(obleute.delete())
    db.session.execute(mitglieder.delete())
    db.session.execute(stellvertreter.delete())
    _PersonSource = sl.get_table(engine, 'person')
    for data in sl.all(engine, _PersonSource):
        person = Person.query.filter_by(
            fingerprint=data.get('fingerprint')).first()
        if person is None:
            person = Person()
        person.slug = data.get('slug')
        person.fingerprint = data.get('fingerprint')
        log.info(" -> %s" % person.fingerprint)
        person.source_url = data.get('source_url')
        person.mdb_id = data.get('mdb_id')
        person.vorname = data.get('vorname')
        person.nachname = data.get('nachname')
        person.adelstitel = data.get('adelstitel')
        person.titel = data.get('titel')
        person.ort = data.get('ort')
        person.geburtsdatum = data.get('geburtsdatum')
        person.religion = data.get('religion')
        person.hochschule = data.get('hochschule')
        person.beruf = data.get('beruf')
        person.berufsfeld = data.get('berufsfeld')
        person.geschlecht = data.get('geschlecht')
        person.familienstand = data.get('familienstand')
        person.kinder = data.get('kinder')
        person.partei = data.get('partei')
        person.land = data.get('land')
        person.bio_url = data.get('bio_url')
        person.bio = data.get('bio')
        person.wissenswertes = data.get('wissenswertes')
        person.homepage_url = data.get('homepage_url')
        person.telefon = data.get('telefon')
        person.angaben = data.get('angaben')
        person.foto_url = data.get('foto_url')
        person.foto_copyright = data.get('foto_copyright')
        person.reden_plenum_url = data.get('reden_plenum_url')
        person.reden_plenum_rss_url = data.get('reden_plenum_rss_url')
        person.twitter_url = data.get('twitter_url')
        person.facebook_url = data.get('facebook_url')
        person.awatch_url = data.get('awatch_url')
        db.session.add(person)
        db.session.flush()
        mdb_rolle = load_rollen(engine, person, data)
        load_gremium_mitglieder(engine, person, mdb_rolle)
    db.session.commit()

def sources_metadata(engine):
    sources = {}
    log.info("Building sources index...")
    source_table = sl.get_table(engine, 'source')
    for source in sl.all(engine, source_table):
        data = {
            'SourceDatasetName': source.get('package_name'),
            'SourceDatasetID': source.get('package_id'),
            'SourceDatasetTitle': source.get('package_title'),
            'SourcePublisherName': source.get('publisher_name'),
            'SourcePublisherTitle': source.get('publisher_title'),
            'SourceID': source.get('resource_id'),
            'SourceURL': source.get('url'),
            'SourceFormat': source.get('format'),
        }
        sources[source['resource_id']] = data
    return sources

def referenzen_index(engine):
    _referenzen = defaultdict(set)
    referenz_tbl = sl.get_table(engine, 'referenz')
    log.info("Building index of ablauf document references per session...")
    for ref in sl.all(engine, referenz_tbl):
        _referenzen[ref['ablauf_key']].add((ref['typ'], ref['nummer']))
    referenzen = defaultdict(set)
    for ablauf, refs in _referenzen.items():
        for typ, num in refs:
            if '/' not in num:
                continue
            if typ == 'plpr':
                wp, session = num.split('/')
                drs = [n for t, n in refs if t != 'plpr']
                if len(drs):
                    referenzen[(ablauf, wp, session)] = drs
    #pprint(dict(referenzen.items()))
    return referenzen

def load_gremien(engine):
    log.info("Loading gremien into production DB...")
    _GremiumSource = sl.get_table(engine, 'gremium')
    for data in sl.all(engine, _GremiumSource):
        gremium = Gremium.query.filter_by(key=data.get('key')).first()
        if gremium is None:
            gremium = Gremium()
        gremium.key = data.get('key')
        gremium.source_url = data.get('source_url')
        gremium.name = data.get('name')
        gremium.typ = data.get('type')
        gremium.url = data.get('url')
        gremium.aufgabe = data.get('aufgabe')
        gremium.rss_url = data.get('rss_url')
        gremium.image_url = data.get('image_url')
        gremium.image_copyright = data.get('image_copyright')
        db.session.add(gremium)
    db.session.commit()

def update_entities(engine, file_name):
    log.info("Updating entities reference sheet: %s", file_name)
    data = {}
    if os.path.exists(file_name):
        fh = open(file_name, 'rb')
        reader = csv.DictReader(fh)
        for d in reader:
            e = dict([(k, v.decode('utf-8')) for (k, v) in d.items()])
            data[e['etlFingerPrint']] = e
        fh.close()
    print len(data)
    fh = open(file_name, 'wb')
    writer = None
    table = sl.get_table(engine, 'entity')
    for row in sl.all(engine, table):
        fp = row['etlFingerPrint']
        if fp is None:
            continue
        if not row.get('canonicalName'):
            row['canonicalName'] = row['etlFingerPrint']
        row['canonicalName'] = cleanCanonical(row['canonicalName'])
        entity = data.get(fp)
        if entity and entity.get('canonicalName') and \
                fp != entity.get('canonicalName'):
            #print entity.get('canonicalName').encode('utf-8')
            row['canonicalName'] = entity.get('canonicalName')
        out = row.copy()
        del out['id']
        sl.upsert(engine, table, out, ['etlFingerPrint'])
        cn = row['canonicalName']
        row['normalizedForm'] = normalize_text(cn)
        row['reverseForm'] = reverse_normalize(cn)
        if writer is None:
            writer = csv.DictWriter(fh, row.keys())
            writer.writerow(dict(zip(row.keys(), row.keys())))
        r = [(k, unicode(v).encode('utf-8')) for k, v in row.items()]
        writer.writerow(dict(r))
    fh.close()

import sqlaload as sl
import csv
import sys
import logging

from common import *

if __name__ == "__main__":
    logging.basicConfig()
    logging.getLogger(__name__).setLevel(logging.DEBUG)
    logging.getLogger("sqlalchemy.engine").setLevel(logging.WARN)

    supplier_map = {}
    engine = db_connect()
    supplier_table = sl.get_table(engine, "suppliers")
    for row in sl.all(engine, supplier_table):
        supplier_map[row["original"]] = row

    writer = csv.DictWriter(
        open(sys.argv[1], "wb"),
        [
            "SupplierName",
            "DepartmentFamily",
            "Entity",
            "ExpenseType",
            "ExpenseArea",
            "VATNumber",
            "AmountAsFloat",
            "DateAsDate",
            "SupplierGuess",
            "SupplierGuessURI",
def process(engine, table):
    gpl = load_csv_keys('gruppierungsplan.csv', 'ID')
    fpl = load_csv_keys('funktionenplan.csv', 'ID')
    for row in sl.all(engine, table):
        sl.upsert(engine, table, for_row(row, fpl, gpl), ['id'])

def dump_table(engine, table):
    file_name = '%s-%s.csv' % (table.name,
                               datetime.utcnow().strftime("%Y-%m-%d"))
    fh = open(file_name, 'wb')
    sl.dump_csv(sl.all(engine, table), fh)
    fh.close()

def test_retrieve_all():
    engine, src_table = connect()
    for row in sl.all(engine, src_table):
        f = partial(retrieve, row, engine, False)
        f.description = describe(row)
        yield f,

def cluster(engine):
    for row in sl.all(engine, sl.get_table(engine, 'entity')):
        print normalize_text(row['etlFingerPrint']).encode('utf-8')

def supplier_tables(engine, table):
    for row in sl.all(engine, table):
        yield supplier_names_from_table(engine, row["resource_id"],
                                        row["table_id"])

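# supplier_tables yields one name generator per (resource_id, table_id)
# row, so a consumer flattens the nesting. Driving it from the
# 'condensed' bookkeeping table is an assumption; any table carrying
# 'resource_id' and 'table_id' columns would do.
def iter_all_supplier_names(engine):
    condensed = sl.get_table(engine, 'condensed')
    for names in supplier_tables(engine, condensed):
        for name in names:
            yield name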