def parse_file(path):
    with open(path, 'rb') as fh:
        ctx = json.load(fh)

    if ctx['source_name'] not in ['MZ']:
        return

    all_name = slugify('%(source_name)s flexicadastre' % ctx, sep='_')
    all_tbl = database[all_name]
    all_tbl.delete()

    layers = ctx.pop('layers')
    for layer in layers:
        lctx = ctx.copy()
        lctx['layer_name'] = layer['name']
        lctx['layer_id'] = layer['id']
        del lctx['rest_url']
        tbl_name = slugify('%(source_name)s %(layer_name)s' % lctx, sep='_')
        tbl = database[tbl_name]
        tbl.delete()
        features = layer['data']['features']
        print ' -> Generating:', tbl_name
        print ' ', layer['name'], layer['id'], len(features)
        for feature in features:
            attrs = convrow(feature.get('attributes'))
            attrs.update(lctx)
            tbl.insert(attrs)
            all_tbl.insert(attrs)

def scrape_company(self, data):
    if self.COMPANIES_SCRAPED < self.COMPANY_OFFSET:
        self.COMPANIES_SCRAPED += 1
        logging.debug('skipping %s' % data.get('code', 'unknown'))
        return
    if self.COMPANIES_SCRAPED > self.MAX_COMPANIES + self.COMPANY_OFFSET:
        logging.info('finished companies at no. %s' % self.COMPANIES_SCRAPED)
        return
    self.COMPANIES_SCRAPED += 1
    logging.info('scraping %s' % data)
    url = API_URL % data.get('ASX code')
    data.update(requests.get(url).json())
    if 'code' not in data:
        return
    data['Stock Info URL'] = url
    data.pop('ASX code', None)
    data.pop('primary_share', None)
    data.pop('last_dividend', None)
    data.pop('latest_annual_reports', None)
    data.pop('products', None)
    record = {}
    for k, v in data.items():
        record[slugify(k, sep='_')] = v
    category = slugify(record['gics_industry_group'])
    if category not in ['materials', 'energy']:
        logging.info('skipping category %s' % category)
        return
    self.scrape_announcements(data)

def parse_file(path):
    with open(path, 'rb') as fh:
        ctx = json.load(fh)

    # if ctx['source_name'] not in ['MZ']:
    #     return

    layers = ctx.pop('layers')
    for layer in layers:
        lctx = ctx.copy()
        lctx['layer_name'] = layer['name']
        lctx['layer_id'] = layer['id']
        lctx.pop('rest_url', None)
        tbl_name = slugify('%(source_name)s %(layer_name)s' % lctx, sep='_')
        tbl = database[tbl_name]
        tbl.delete()
        features = layer['data']['features']
        print ' -> Generating:', tbl_name
        print ' ', layer['name'], layer['id'], len(features)
        for feature in features:
            attrs = convrow(feature.get('attributes'))
            attrs.update(lctx)
            tbl.insert(attrs)
        dataset.freeze(tbl, prefix=DEST_PATH,
                       filename='%s.csv' % tbl_name, format='csv')

def store_layer_to_db(data, layer, features):
    """Load a layer of features into a database table."""
    # table names are generated from the name of the layer and
    # the name of the country.
    tbl_name = '%s %s' % (data['name'], layer['name'])
    tbl_name = slugify(tbl_name, sep='_')
    log.info(' -> %s: %s rows', tbl_name, len(features))
    tbl = database[tbl_name]
    # clear out all existing data.
    tbl.delete()
    rows = []
    types = {}
    for feature in features:
        row = convrow(feature['attributes'])
        for k, v in row.items():
            if isinstance(v, (int, long)):
                types[k] = BigInteger
        row['layer_name'] = layer['name']
        row['layer_id'] = layer['id']
        row['source_name'] = data['name']
        row['source_title'] = data['title']
        row['source_url'] = data['url']
        if QUERY['returnGeometry'] == 'true':
            # store the geometry as JSON. not sure this is a
            # great idea because it may make the resulting
            # CSV files really hard to parse.
            row['_geometry'] = json.dumps(feature['geometry'])
        row['_attributes'] = json.dumps(feature['attributes'])
        rows.append(row)
    tbl.insert_many(rows, types=types)

    # Dump the table to a CSV file
    csv_file = '%s.csv' % tbl_name
    log.info(' -> %s', csv_file)
    dataset.freeze(tbl, prefix=DATA_PATH, filename=csv_file, format='csv')

def crawl(self, directory=None, collection=None, meta={}):
    collection = collection or directory
    collection = Collection.create({
        'foreign_id': 'directory:%s' % slugify(collection),
        'label': collection
    })
    db.session.commit()
    collection_id = collection.id

    if os.path.isfile(directory):
        self.crawl_file(collection_id, directory, meta)

    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        dirparts = [d for d in dirname.split(os.path.sep)
                    if d in SKIP_DIRECTORIES]
        if len(dirparts):
            continue
        log.info("Descending: %r", dirname)
        for file_name in files:
            dirname = string_value(dirname)
            file_name = string_value(file_name)
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            self.crawl_file(collection_id, file_path, meta)

def make_filename(source, sep='-'):
    if source is not None:
        source = os.path.basename(source)
        slugs = [slugify(s, sep=sep) for s in source.split('.')]
        source = '.'.join(slugs)
        source = source.strip('.').strip(sep)
    return source

def by_country(self, country):
    country_slug = slugify(country)
    for rig_data in self.rigs_data:
        if slugify(rig_data.get('country')) != country_slug:
            continue
        rig_slug = self.make_entity(rig_data['name'], 'rig', raw=rig_data)
        rig = self.entities[('rig', rig_slug)]
        for role in ['owner', 'operator', 'manager']:
            rig[role] = self.make_entity(rig_data.get(role), 'company')
        rig['flag'] = self.make_entity(rig_data.get('flag'), 'rflag')
        rig['location'] = self.make_entity(rig_data.get('country'), 'location')
    return {'entities': self.entities.values()}

def parse_file(path):
    with open(path, 'rb') as fh:
        ctx = json.load(fh)

    if ctx['source_name'] not in ['TZ']:
        return

    for layer in ctx.get('layers'):
        out = {
            "type": "FeatureCollection",
            "features": []
        }
        for fdata in layer.pop('data').get('features'):
            attrs = get_attrs(fdata)
            if not fdata.get('geometry', {}).get('rings'):
                continue
            props = dict(attrs)
            props['layer'] = layer.get('name')
            feature = {
                'type': 'Feature',
                'geometry': {
                    'type': 'Polygon',
                    'coordinates': fdata.get('geometry', {}).get('rings')
                },
                'properties': props
            }
            out['features'].append(feature)

        name = slugify('%s %s' % (ctx['source_name'], layer.get('name')),
                       sep='_')
        name = name + '.json'
        with open(os.path.join(DEST_PATH, name), 'wb') as fh:
            json.dump(out, fh)

def extract_address(ext, prefix, query):
    if query is None:
        return {}
    data = {
        prefix + '_official_name': ext.text(query+'OFFICIALNAME'),
        prefix + '_address': ext.text(query+'ADDRESS'),
        prefix + '_town': ext.text(query+'TOWN'),
        prefix + '_postal_code': ext.text(query+'POSTAL_CODE'),
        prefix + '_country': ext.attr(query+'COUNTRY', 'VALUE'),
        prefix + '_attention': ext.text(query+'ATTENTION'),
        prefix + '_phone': ext.text(query+'PHONE'),
        prefix + '_email': ext.text(query+'EMAIL') or ext.text(query+'E_MAIL'),
        prefix + '_fax': ext.text(query+'FAX'),
        prefix + '_url': ext.text(query+'URL_GENERAL') or ext.text(query+'URL'),
        prefix + '_url_buyer': ext.text(query+'URL_BUYER'),
        prefix + '_url_info': ext.text(query+'URL_INFORMATION'),
        prefix + '_url_participate': ext.text(query+'URL_PARTICIPATE')
    }
    if data[prefix + '_official_name'] is not None:
        data[prefix + '_slug'] = slugify(data[prefix + '_official_name'])
    for k, v in data.items():
        if v is None:
            del data[k]
    return data

def store_layer_to_geojson(data, layer, features):
    """Store the returned data as a GeoJSON file."""
    # skip if we're not loading geometries:
    if QUERY['returnGeometry'] != 'true':
        return
    out = {
        "type": "FeatureCollection",
        "features": []
    }
    for fdata in features:
        attrs = {}
        for k, v in fdata.get('attributes').items():
            k = k.lower().strip()
            attrs[k] = v
        if not fdata.get('geometry', {}).get('rings'):
            continue
        props = dict(attrs)
        props['layer'] = layer.get('name')
        out['features'].append({
            'type': 'Feature',
            'geometry': {
                'type': 'Polygon',
                'coordinates': fdata.get('geometry', {}).get('rings')
            },
            'properties': props
        })
    name = slugify('%s %s' % (data['name'], layer.get('name')), sep='_')
    name = name + '.geojson'
    log.info(' -> %s', name)
    with open(os.path.join(DATA_PATH, name), 'wb') as fh:
        json.dump(out, fh)

def slugify(mapping, bind, values):
    """ Transform all values into URL-capable slugs. """
    for value in values:
        if isinstance(value, six.string_types):
            value = transliterate(value)
            value = normality.slugify(value)
        yield value

def make_filename(file_name, sep='-'):
    if file_name is not None:
        file_name = os.path.basename(file_name)
        slugs = [slugify(s, sep=sep) for s in file_name.rsplit('.', 1)]
        slugs = [s[:200] for s in slugs if s is not None]
        file_name = '.'.join(slugs)
        file_name = file_name.strip('.').strip(sep)
    return file_name

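# Illustrative usage sketch, not part of the original snippet above; it assumes
# `slugify` here is normality.slugify with its default lowercasing behaviour.
# The base name and extension are slugged separately around the last dot, so
# the extension separator survives:
#
#   make_filename('Quarterly Report (FINAL).PDF')  # -> 'quarterly-report-final.pdf'
#   make_filename(None)                            # -> None
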
def add_codelist(codelist_name, codelist_data):
    codelist = models.Codelist()
    codelist.code = normality.slugify(codelist_name)
    codelist.name = codelist_name
    db.session.add(codelist)
    db.session.commit()
    for codelist_code in codelist_data:
        add_codelist_data(codelist_name, codelist_code)
    db.session.commit()

def create_fixtures():
    if 'engine' not in SHARED:
        conn = dataset.connect('sqlite://')
        for table in ['companies', 'financials']:
            with open(os.path.join(FIXTURE_PATH, table + '.csv'), 'r') as fh:
                for row in unicodecsv.DictReader(fh):
                    data = {slugify(k, sep='_'): v for k, v in row.items()}
                    conn[table].insert(data)
        SHARED['engine'] = conn.engine
    return SHARED['engine']

def load_countries():
    if len(COUNTRIES):
        return COUNTRIES
    with open(os.path.join(DATA_FIXTURES, 'countries.csv'), 'r') as fh:
        for row in unicodecsv.DictReader(fh):
            name = slugify(row['name'], sep=' ').strip()
            code = row['code'].strip().upper()
            REQUESTED.append({'name': row['name'], 'code': code})
            COUNTRIES[name] = code
    return COUNTRIES

def load_countries():
    if len(COUNTRIES):
        return COUNTRIES
    with open(os.path.join(DATA_FIXTURES, "countries.csv"), "r") as fh:
        for row in unicodecsv.DictReader(fh):
            name = slugify(row["name"], sep=" ").strip()
            code = row["code"].strip().upper()
            REQUESTED.append({"name": row["name"], "code": code})
            COUNTRIES[name] = code
    return COUNTRIES

def convert_row(row):
    out = {}
    for field, value in row.items():
        field = slugify(field, sep='_')
        value = value.strip()
        # TODO handle excel dates etc.
        if not len(value):
            continue
        out[field] = value
    return out

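# Illustrative usage sketch, not from the original source: header keys are
# slugged to snake_case and blank values are dropped (the sample field names
# below are made up).
#
#   convert_row({'Licence Holder': ' Acme Mining Ltd ', 'Area (km2)': ''})
#   # -> {'licence_holder': 'Acme Mining Ltd'}
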
def add_column(self, label):
    column = slugify(label or "", sep="_")[:55]
    column = column or "column"
    name, i = column, 2
    # de-dupe: column, column_2, column_3, ...
    while name in [c.name for c in self.columns]:
        name = "%s_%s" % (name, i)
        i += 1
    column = {"label": label, "name": column}
    self.schema["columns"].append(column)
    return TabularColumn(self, column)

def column_alias(cell, names):
    """ Generate a normalized version of the column name. """
    column = slugify(cell.column or '', sep='_')
    column = column.strip('_')
    column = 'column' if not len(column) else column
    name, i = column, 2
    # de-dupe: column, column_2, column_3, ...
    while name in names:
        name = '%s_%s' % (name, i)
        i += 1
    return name

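# Illustrative usage sketch, not from the original source; `cell` is assumed to
# be a messytables-style object with a `.column` attribute, mocked up here as a
# hypothetical FakeCell:
#
#   class FakeCell(object):
#       def __init__(self, column):
#           self.column = column
#
#   names = set()
#   names.add(column_alias(FakeCell('Amount (EUR)'), names))  # -> 'amount_eur'
#   column_alias(FakeCell('Amount (EUR)'), names)             # -> 'amount_eur_2'
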
def to_proxy(self):
    meta = dict(self.meta)
    headers = meta.pop('headers', {})
    headers = {slugify(k, sep='_'): v for k, v in headers.items()}
    proxy = model.get_proxy({
        'id': str(self.id),
        'schema': self.model,
        'properties': meta
    })
    proxy.set('contentHash', self.content_hash)
    proxy.set('parent', self.parent_id)
    proxy.set('ancestors', self.ancestors)
    proxy.set('processingStatus', self.status)
    proxy.set('processingError', self.error_message)
    proxy.set('fileSize', meta.get('file_size'))
    proxy.set('fileName', meta.get('file_name'))
    if not proxy.has('fileName'):
        disposition = headers.get('content_disposition')
        if disposition is not None:
            _, attrs = cgi.parse_header(disposition)
            proxy.set('fileName', attrs.get('filename'))
    proxy.set('mimeType', meta.get('mime_type'))
    if not proxy.has('mimeType'):
        proxy.set('mimeType', headers.get('content_type'))
    proxy.set('language', meta.get('languages'))
    proxy.set('country', meta.get('countries'))
    proxy.set('authoredAt', meta.get('authored_at'))
    proxy.set('modifiedAt', meta.get('modified_at'))
    proxy.set('publishedAt', meta.get('published_at'))
    proxy.set('retrievedAt', meta.get('retrieved_at'))
    proxy.set('sourceUrl', meta.get('source_url'))
    proxy.set('messageId', meta.get('message_id'), quiet=True)
    proxy.set('inReplyTo', meta.get('in_reply_to'), quiet=True)
    proxy.set('bodyText', self.body_text, quiet=True)
    proxy.set('bodyHtml', self.body_raw, quiet=True)
    columns = meta.get('columns')
    proxy.set('columns', registry.json.pack(columns), quiet=True)
    proxy.set('headers', registry.json.pack(headers), quiet=True)
    pdf = 'application/pdf'
    if meta.get('extension') == 'pdf' or proxy.first('mimeType') == pdf:
        proxy.set('pdfHash', self.content_hash, quiet=True)
    proxy.add('pdfHash', meta.get('pdf_version'), quiet=True)
    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == self.id)
    q = q.filter(DocumentTag.type.in_(DocumentTag.MAPPING.keys()))
    q = q.order_by(DocumentTag.weight.desc())
    q = q.limit(Document.MAX_TAGS)
    for tag in q.all():
        prop = DocumentTag.MAPPING.get(tag.type)
        if prop is not None:
            proxy.add(prop, tag.text)
    return proxy

def __init__(self, context, parent, node):
    self.context = context
    self.parent = parent
    self.node = node
    self._results = []
    if node.name is None:
        self.id = 'root'
    else:
        prefix = '_any' if node.name == '*' else node.name
        id = '%s_%s' % (prefix, uuid4().hex[:5])
        self.id = slugify(id, '_')
    self.var = v[self.id]

def add_column(self, label):
    column = slugify(label or '', sep='_')
    column = column or 'column'
    column = column[:55]
    name, i = column, 2
    # de-dupe: column, column_2, column_3, ...
    while name in [c.name for c in self.columns]:
        name = '%s_%s' % (name, i)
        i += 1
    column = {'label': label, 'name': column}
    self.schema['columns'].append(column)
    return TabularColumn(self, column)

def cleanup(row):
    data = {}
    for k, v in row.items():
        k = slugify(k, sep='_')
        if isinstance(v, basestring):
            v = v.strip()
            if not len(v):
                continue
        if v is None:
            continue
        data[k] = v
    return data

def get_files(data):
    url = data.get('issue_url')
    for href, a in content_links(url):
        d = data.copy()
        d['file'] = a.text_content()
        if href.endswith('/view'):
            href, _ = href.rsplit('/view', 1)
        if not href.endswith('.pdf'):
            continue
        d['url'] = href
        file_name = slugify(d['file'], sep='_')
        path = slugify(d['issue'], sep='_')
        file_name = os.path.join(DATA_PATH, 'boletin', path, file_name)
        try:
            os.makedirs(os.path.dirname(file_name))
        except:
            pass
        print [file_name]
        if not os.path.isfile(file_name):
            urlretrieve(d['url'], file_name)
        documentcloudify(file_name, d)

def get_alias(self, schema):
    """ Slightly hacky way of getting a slug-like name for a schema.
    This is used to determine document types in the Elastic index. """
    for alias, uri in self.schemas.items():
        if uri == schema:
            return alias
    p = urlparse.urlparse(schema)
    name, _ = os.path.splitext(os.path.basename(p.path))
    name = slugify(name, sep='_')
    if not len(name) or name in self.schemas:
        raise ConfigException("Cannot determine alias for: %r" % schema)
    self['schemas'][name] = schema
    return name

def make_person(engine, beitrag, fp, source_url):
    person = {
        'fingerprint': fp,
        'slug': slugify(fp, sep='-'),
        'source_url': source_url,
        'vorname': beitrag['vorname'],
        'nachname': beitrag['nachname'],
        'ort': beitrag.get('ort'),
        'ressort': beitrag.get('ressort'),
        'land': beitrag.get('land'),
        'fraktion': beitrag.get('fraktion')
    }
    tbl_person.upsert(person, ['fingerprint'])
    return fp

def make_entity(self, name, type_, raw={}):
    slug = slugify(name)
    if not slug:
        return
    entity = {'name': name, 'slug': slug, 'type': type_}
    for k, v in raw.items():
        entity['raw_%s' % k] = v
    key = (type_, slug)
    if key in self.entities:
        self.entities[key].update(entity)
    else:
        self.entities[key] = entity
    return slug

def crawl_document(self, url):
    try:
        self.check_tag(url=url)
    except TagExists:
        pass
    res = requests.get(url)
    doc = html.fromstring(res.content)
    data = {
        'details_url': url,
        'title': doc.findtext('.//div[@class="c00v3-introduction"]/h1'),
        'summary': doc.findtext('.//span[@id="detail_abstract"]') or
                   doc.findtext('.//span[@id="summary_abstract"]')
    }
    log.info("Crawling WB document: %s, %s", data['title'], url)
    if doc.find('.//div[@id="CitationHidDiv"]') is not None:
        text = clean(doc.find('.//div[@id="CitationHidDiv"]'))
        data['citation'] = text
    for li in doc.findall('.//ul[@class="detail"]/li'):
        label = li.findtext('./label')
        if label is None:
            continue
        label = slugify(label, sep='_')
        value = li.find('./span').xpath('string()')
        if value is None:
            continue
        value = value.strip().strip(';')
        if label == 'rel_proj_id':
            values = value.split(' -- ')
            value = values[0]
            if len(values) > 1:
                data['project_id'] = values[1]
        if len(value):
            data[label] = clean(value)
    for li in doc.findall('.//ul[@class="documentLnks"]/li'):
        record = data.copy()
        if li.get('class') != 'textdoc':
            doc_url = li.find('a').get('href')
            # from pprint import pprint
            # pprint(record)
            self.emit_url(doc_url, title=data['title'],
                          summary=data['summary'], meta=record)

def render_changes(changes):
    if not len(changes):
        return None
    with open('notification.jinja.html', 'r') as fh:
        template = Template(fh.read())
    layers = {}
    source = {}
    for change in changes:
        data = change.get('record_new') or change.get('record_old')
        if not len(source):
            source = {
                'url': data.get('source_url'),
                'title': data.get('source_title'),
                'name': data.get('source_name')
            }
        layer_id = data.get('layer_id')
        if layer_id not in layers:
            csv_name = '%(source_name)s %(layer_name)s' % data
            csv_name = slugify(csv_name, sep='_')
            csv_name = 'http://data.pudo.org/flexicadastre/csv/%s.csv' % csv_name
            layers[layer_id] = {
                'id': layer_id,
                'title': data.get('layer_name'),
                'csv': csv_name,
                'changes': []
            }
        for obj in [change.get('record_new'), change.get('record_old')]:
            obj.pop('layer_id', None)
            obj.pop('layer_name', None)
            obj.pop('source_title', None)
            obj.pop('source_name', None)
            obj.pop('source_url', None)
        headers = change.get('record_new').keys()
        headers.extend(change.get('record_old').keys())
        change['headers'] = list(sorted(headers))
        layers[layer_id]['changes'].append(change)
    out = template.render(source=source, layers=layers)
    out_name = os.path.join(DATA_PATH,
                            'report_%s_%s.html' % (source['name'], TODAY))
    with open(out_name, 'w') as fo:
        fo.write(out.encode('utf-8'))

def crawl(self, directory=None, collection=None, meta={}):
    directory = string_value(directory)
    if directory is None or not os.path.exists(directory):
        log.error("Invalid directory: %r", directory)
        return
    directory = os.path.abspath(os.path.normpath(directory))
    collection = collection or directory
    collection = Collection.create({
        'foreign_id': 'directory:%s' % slugify(collection),
        'label': collection
    })
    db.session.commit()
    meta = self.make_meta(meta)
    meta.source_path = directory
    ingest_directory(collection.id, meta, directory)

def parse_entry(emitter, entry):
    entity = emitter.make("LegalEntity")
    if entry.find("./default:subjectType", NS).get("classificationCode") == "P":
        entity = emitter.make("Person")
    reference_no = slugify(entry.get("euReferenceNumber"))
    entity.id = "fsf-%s" % reference_no

    regulation = entry.find("./default:regulation", NS)
    source_url = regulation.findtext("./default:publicationUrl", "", NS)
    entity.add("sourceUrl", source_url)

    sanction = emitter.make("Sanction")
    sanction.make_id(entity.id)
    sanction.add("entity", entity)
    sanction.add("authority", "European Union")
    sanction.add("sourceUrl", source_url)
    program = jointext(
        regulation.get("programme"),
        regulation.get("numberTitle"),
        sep=" - ",
    )
    sanction.add("program", program)
    sanction.add("reason", entry.findtext("./default:remark", "", NS))
    sanction.add("startDate", regulation.get("entryIntoForceDate"))

    for name in entry.findall("./default:nameAlias", NS):
        if entity.has("name"):
            entity.add("alias", name.get("wholeName"))
        else:
            entity.add("name", name.get("wholeName"))
        entity.add("title", name.get("title"), quiet=True)
        entity.add("firstName", name.get("firstName"), quiet=True)
        entity.add("middleName", name.get("middleName"), quiet=True)
        entity.add("lastName", name.get("lastName"), quiet=True)
        entity.add("position", name.get("function"), quiet=True)
        gender = GENDERS.get(name.get("gender"))
        entity.add("gender", gender, quiet=True)

    # TODO: support other types of ID
    for pnode in entry.findall(
            './default:identification[@identificationTypeCode="passport"]', NS):
        passport = emitter.make("Passport")
        passport.make_id("Passport", entity.id, pnode.get("logicalId"))
        passport.add("holder", entity)
        passport.add("passportNumber", pnode.get("number"))
        passport.add("country", pnode.get("countryIso2Code"))
        emitter.emit(passport)

    for node in entry.findall("./default:address", NS):
        address = jointext(
            node.get("street"),
            node.get("city"),
            node.findtext("default:zipCode", "", NS),
        )
        entity.add("address", address)
        entity.add("country", node.get("countryIso2Code"))

    for birth in entry.findall("./default:birthdate", NS):
        entity.add("birthDate", birth.get("birthdate"))
        entity.add("birthPlace", birth.get("city"))

    for country in entry.findall("./default:citizenship", NS):
        entity.add("nationality", country.get("countryIso2Code"), quiet=True)

    emitter.emit(entity)
    emitter.emit(sanction)

for outdir, url in SECTION.items():
    res = requests.get(url)
    sio = StringIO(res.content)
    # with open(os.path.join(DATA, '%s.csv' % section), 'r') as fh:
    for row in unicodecsv.DictReader(sio):
        data = {}
        for k, v in row.items():
            v = v.strip()
            if not len(v):
                continue
            elif v.lower() == 'n/a':
                continue
            k = k.replace("'", '')
            nk = slugify(k, sep='_')
            nk = RENAMES.get(nk, nk)
            if nk in data:
                raise ValueError(nk, k)
            if v.lower() == 'yes':
                v = True
            elif v.lower() == 'no':
                v = False
            if nk in SPLIT_FIELDS:
                v = [x.strip() for x in v.split(',') if len(x.strip())]
            data[nk] = v
        urlslug = slugify(data.get('name'), sep='-')
        if urlslug is None or not len(urlslug):
            print [data.get('name'), urlslug]
            continue
        fileslug = slugify(data.get('name'), sep='_')

def convert_snakecase(name):
    name = titlecase(name)
    return slugify(name, sep='_')

def test_petro(self):
    text = u"Порошенко Петро Олексійович"
    self.assertEqual("porosenko-petro-oleksijovic", slugify(text))
    self.assertEqual("Porosenko Petro Oleksijovic", ascii_text(text))
    self.assertEqual(u"Porošenko Petro Oleksíjovič", latinize_text(text))
    self.assertEqual(u"порошенко петро олексіиович", normalize(text))

def node_id(self, value: str) -> str:
    return "addr:%s" % slugify(value)

def slugify(mapping, bind, values):
    """ Transform all values into URL-capable slugs. """
    return [normality.slugify(v) for v in values]

def extension(self):
    name, ext = os.path.splitext(self.real_path)
    if len(ext):
        return slugify(ext, '')

# coding: utf-8
from normality import normalize, latinize_text, ascii_text, slugify

SAMPLES = [
    u'Порошенко Петро Олексійович',
    u'FUAD ALIYEV ƏHMƏD OĞLU',
    u'Häschen Spaß',
    u'ავლაბრის ფონდი',
]

for sample in SAMPLES:
    print 'SAMPLE :', sample
    print ' NORM :', normalize(sample)
    print ' SLUG :', slugify(sample)
    print ' LATIN:', latinize_text(sample)
    print ' ASCII:', ascii_text(sample)

def normalizeDBcol(col):
    return slugify(col).replace('-', '_')

def headers(self):
    # normalize header names
    headers = {}
    for k, v in self.data.get('headers', {}).items():
        headers[slugify(k, sep='_')] = v
    return headers

def node_id(self, value):
    return "name:%s" % slugify(value)

def make_id(*parts):
    parts = [unicode(p) for p in parts if p is not None]
    parts = [slugify(p, sep='-') for p in parts if len(p)]
    return ':'.join(parts)

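# Illustrative usage sketch, not from the original source; `unicode` implies
# Python 2 and `slugify` is assumed to be normality.slugify. None parts are
# dropped, the rest are slugged and joined with ':':
#
#   make_id('ZA', 'Mining Permit', 1042)   # -> u'za:mining-permit:1042'
#   make_id(None, 'Mining Permit', '')     # -> u'mining-permit'
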
def add_codelist_data(codelist_name, codelist_code):
    codelistcode = models.CodelistCode()
    codelistcode.code = codelist_code["code"]
    codelistcode.name = codelist_code["name"]
    codelistcode.codelist_code = normality.slugify(codelist_name)
    db.session.add(codelistcode)

def slug(name):
    return slugify(name, sep='_')

def headers(self):
    raw = self.meta.get('headers', {})
    return {slugify(k, sep='_'): v for k, v in raw.items()}

def crawl_person(context, name, url):
    # context.log.info("Crawling member", name=name, url=url)
    res = context.http.get(url)
    doc = html.fromstring(res.text)
    _, person_id = url.rsplit("/", 1)
    person = context.make("Person")
    person.id = f"eu-cor-{person_id}"
    person.add("sourceUrl", url)
    person.add("name", name)
    person.add("topics", "role.pep")
    last_name, first_name = name.split(", ", 1)
    person.add("firstName", first_name)
    person.add("lastName", last_name)

    address = {}
    details = doc.find('.//div[@class="regular-details"]')
    for row in details.findall('.//ul[@class="no-bullet"]/li'):
        children = row.getchildren()
        title = children[0]
        title_text = collapse_spaces(stringify(title.text_content()))
        title_text = title_text or title.get("class")
        value = collapse_spaces(title.tail)
        if title_text in ("Full name:", "Address:", "Declaration of interests"):
            # ignore these.
            continue
        if title_text == "Emails:":
            emails = [e.text for e in row.findall(".//a")]
            person.add("email", emails)
            continue
        if "glyphicon-phone" in title_text:
            person.add("phone", value.split(","))
            continue
        if "fa-fax" in title_text:
            # TODO: yeah, no
            # person.add("phone", value)
            continue
        if title_text in ("Web sites:", "list-inline"):
            sites = [e.get("href") for e in row.findall(".//a")]
            person.add("website", sites)
            continue
        if title_text == "Represented Country:":
            person.add("country", value)
            continue
        if title_text == "Languages:":
            # TODO: missing in FtM
            # person.add("languages", value.split(','))
            continue
        if "Regions since:" in title_text:
            date = parse_date(value)
            person.context["created_at"] = date
            continue
        if "Date of birth:" in title_text:
            person.add("birthDate", parse_date(value))
            continue
        if "Commissions:" in title_text:
            for com in row.findall(".//li"):
                text = collapse_spaces(com.text_content())
                sep = "Mandate - "
                if sep in text:
                    _, text = text.split(sep, 1)
                person.add("sector", text)
            continue
        if "Areas of interest:" in title_text:
            for area in row.findall(".//li"):
                person.add("keywords", area.text_content())
            continue
        if title.tag == "i" and value is None:
            person.add("position", title_text)
            continue
        if title_text in ("Country:"):
            person.add("country", value)
        if title_text in ("Street:", "Postal code:", "City:", "Country:"):
            address[title_text.replace(":", "")] = value
            continue
        if title_text == "Political group:":
            group = context.make("Organization")
            group.add("name", value)
            slug = value
            if "(" in slug:
                _, slug = slug.rsplit("(", 1)
            slug = slugify(slug, sep="-")
            group.id = f"eu-cor-group-{slug}"
            context.emit(group)
            member = context.make("Membership")
            member.make_id("Membership", person.id, group.id)
            member.add("member", person)
            member.add("organization", group)
            context.emit(member)
            continue

    address = (
        address.get("Street"),
        address.get("City"),
        address.get("Postal code"),
        address.get("Country"),
    )
    address = ", ".join([a for a in address if a is not None])
    person.add("address", address)
    context.emit(person)

def headers(self):
    # normalize header names
    raw = self._headers or {}
    return {slugify(k, sep='_'): v for k, v in raw.items()}

def create(cls, name, *keys):
    keys = [slugify(k, sep='-') for k in keys]
    entity_id = '-'.join([k for k in keys if k is not None])
    entity_id = '%s.%s' % (name, entity_id)
    entity = Entity(source=name, id=entity_id)
    return entity

def node_id(self, value):
    return 'name:%s' % slugify(value)

def node_id(self, value):
    return 'addr:%s' % slugify(value)

def display_slug(self):
    return slugify(self.display_label, sep=' ')

def test_slugify(self):
    text = u'BABY! camel-is good'
    self.assertEqual('baby-camel-is-good', slugify(text, sep='-'))

def test_german(self):
    text = u"Häschen Spaß"
    self.assertEqual("Haschen Spass", ascii_text(text))
    self.assertEqual("haschen-spass", slugify(text, sep="-"))

def node_id(self, value: str) -> Optional[str]:
    slug = slugify(value)
    if slug is None:
        return None
    return f"name:{slug}"

def node_id(self, value: str) -> Optional[str]:
    slug = slugify(value)
    if slug is None:
        return None
    return f"addr:{slug}"

def test_petro(self):
    text = u'Порошенко Петро Олексійович'
    self.assertEqual('porosenko-petro-oleksijovic', slugify(text))
    self.assertEqual('Porosenko Petro Oleksijovic', ascii_text(text))
    self.assertEqual(u'Porošenko Petro Oleksíjovič', latinize_text(text))
    self.assertEqual(u'порошенко петро олексіиович', normalize(text))

def convert_snakecase(name):
    if name.upper() != name:
        name = titlecase(name)
    return slugify(name, sep='_')

def __init__(self, matcher, out_dir, file_name):
    self.file_name = decode_path(file_name)
    base_name = slugify(file_name, sep='_')
    super(CsvTableSearcher, self).__init__(matcher, out_dir, base_name)

import os
from normality import slugify

PROJECT_NAME = 'Follow the Money'
PROJECT_NAME = os.environ.get('INTEGRATE_PROJECT_NAME', PROJECT_NAME)

DATABASE_URI = 'sqlite:///integrate.sqlite3'
DATABASE_URI = os.environ.get('INTEGRATE_DATABASE_URI', DATABASE_URI)
DATABASE_PREFIX = os.environ.get('INTEGRATE_DATABASE_PREFIX', 'ftm')
DATABASE_PREFIX = slugify(DATABASE_PREFIX, sep='_')

QUORUM = int(os.environ.get('INTEGRATE_QUORUM', 1))

def url(self):
    study_url = self.base + '/' + slugify(self.title)
    if self.area is None or type(self.area) is list:
        return study_url
    else:
        return study_url + '/' + self.area.lower()