Example #1
def parse_file(path):
    with open(path, 'rb') as fh:
        ctx = json.load(fh)

    if ctx['source_name'] not in ['MZ']:
        return

    all_name = slugify('%(source_name)s flexicadastre' % ctx, sep='_')
    all_tbl = database[all_name]
    all_tbl.delete()

    layers = ctx.pop('layers')
    for layer in layers:
        lctx = ctx.copy()
        lctx['layer_name'] = layer['name']
        lctx['layer_id'] = layer['id']
        del lctx['rest_url']

        tbl_name = slugify('%(source_name)s %(layer_name)s' % lctx, sep='_')
        tbl = database[tbl_name]
        tbl.delete()

        features = layer['data']['features']
        print(' -> Generating:', tbl_name)
        print('    ', layer['name'], layer['id'], len(features))

        for feature in features:
            attrs = convrow(feature.get('attributes'))
            attrs.update(lctx)
            tbl.insert(attrs)
            all_tbl.insert(attrs)
Example #2
    def scrape_company(self, data):
        if self.COMPANIES_SCRAPED < self.COMPANY_OFFSET:
            self.COMPANIES_SCRAPED += 1
            logging.debug('skipping %s' % data.get('code', 'unknown'))
            return
        if self.COMPANIES_SCRAPED > self.MAX_COMPANIES + self.COMPANY_OFFSET:
            logging.info('finished companies at no. %s' % self.COMPANIES_SCRAPED)
            return
        self.COMPANIES_SCRAPED += 1
        logging.info('scraping %s' % data)
        url = API_URL % data.get('ASX code')
        data.update(requests.get(url).json())
        if 'code' not in data:
            return
        data['Stock Info URL'] = url
        data.pop('ASX code', None)
        data.pop('primary_share', None)
        data.pop('last_dividend', None)
        data.pop('latest_annual_reports', None)
        data.pop('products', None)

        record = {}
        for k, v in data.items():
            record[slugify(k, sep='_')] = v

        category = slugify(record['gics_industry_group'])
        if category not in ['materials', 'energy']:
            logging.info('skipping category %s' % category)
            return

        self.scrape_announcements(data)
Example #3
def parse_file(path):
    with open(path, 'rb') as fh:
        ctx = json.load(fh)

    #if ctx['source_name'] not in ['MZ']:
    #    return

    layers = ctx.pop('layers')
    for layer in layers:
        lctx = ctx.copy()
        lctx['layer_name'] = layer['name']
        lctx['layer_id'] = layer['id']
        lctx.pop('rest_url', None)

        tbl_name = slugify('%(source_name)s %(layer_name)s' % lctx, sep='_')
        tbl = database[tbl_name]
        tbl.delete()

        features = layer['data']['features']
        print(' -> Generating:', tbl_name)
        print('    ', layer['name'], layer['id'], len(features))

        for feature in features:
            attrs = convrow(feature.get('attributes'))
            attrs.update(lctx)
            tbl.insert(attrs)

        dataset.freeze(tbl, prefix=DEST_PATH, filename='%s.csv' % tbl_name, format='csv')
Example #4
def store_layer_to_db(data, layer, features):
    """Load a layer of features into a database table."""
    # table names are generated from the name of the layer and
    # the name of the country.
    tbl_name = '%s %s' % (data['name'], layer['name'])
    tbl_name = slugify(tbl_name, sep='_')
    log.info('    -> %s: %s rows', tbl_name, len(features))
    tbl = database[tbl_name]
    # clear out all existing data.
    tbl.delete()
    rows = []
    types = {}
    for feature in features:
        row = convrow(feature['attributes'])
        for k, v in row.items():
            if isinstance(v, int):
                types[k] = BigInteger
        row['layer_name'] = layer['name']
        row['layer_id'] = layer['id']
        row['source_name'] = data['name']
        row['source_title'] = data['title']
        row['source_url'] = data['url']
        if QUERY['returnGeometry'] == 'true':
            # store the geometry as JSON. not sure this is a
            # great idea because it may make the resulting
            # CSV files really hard to parse.
            row['_geometry'] = json.dumps(feature['geometry'])
            row['_attributes'] = json.dumps(feature['attributes'])
        rows.append(row)
    tbl.insert_many(rows, types=types)

    # Dump the table to a CSV file
    csv_file = '%s.csv' % tbl_name
    log.info('    -> %s', csv_file)
    dataset.freeze(tbl, prefix=DATA_PATH, filename=csv_file, format='csv')
Example #5
    def crawl(self, directory=None, collection=None, meta={}):
        collection = collection or directory
        collection = Collection.create({
            'foreign_id': 'directory:%s' % slugify(collection),
            'label': collection
        })
        db.session.commit()
        collection_id = collection.id

        # default to the working directory before touching the path
        directory = directory or os.getcwd()
        if os.path.isfile(directory):
            self.crawl_file(collection_id, directory, meta)

        directory = directory.encode('utf-8')
        for (dirname, dirs, files) in os.walk(directory):
            dirparts = [d for d in dirname.split(os.path.sep)
                        if d in SKIP_DIRECTORIES]
            if len(dirparts):
                continue
            log.info("Descending: %r", dirname)
            for file_name in files:
                dirname = string_value(dirname)
                file_name = string_value(file_name)
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(dirname, file_name)
                self.crawl_file(collection_id, file_path, meta)
Example #6
def make_filename(source, sep='-'):
    if source is not None:
        source = os.path.basename(source)
        slugs = [slugify(s, sep=sep) for s in source.split('.')]
        source = '.'.join(slugs)
        source = source.strip('.').strip(sep)
    return source
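
A quick illustration of what this helper yields, assuming normality-style slugify (lowercasing, transliteration, separator substitution); the path is invented:

# each dot-separated segment of the basename is slugified on its own
make_filename('/tmp/My Report.FINAL.pdf')   # -> 'my-report.final.pdf'
make_filename(None)                         # -> None (passed through)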
Example #7
    def by_country(self, country):
        country_slug = slugify(country)
        for rig_data in self.rigs_data:
            if slugify(rig_data.get('country')) != country_slug:
                continue
            rig_slug = self.make_entity(rig_data['name'], 'rig', raw=rig_data)
            rig = self.entities[('rig', rig_slug)]

            for role in ['owner', 'operator', 'manager']:
                rig[role] = self.make_entity(rig_data.get(role), 'company')

            rig['flag'] = self.make_entity(rig_data.get('flag'), 'rflag')
            rig['location'] = self.make_entity(rig_data.get('country'),
                                               'location')

        return {'entities': self.entities.values()}
Example #8
def parse_file(path):
    with open(path, 'rb') as fh:
        ctx = json.load(fh)

    if ctx['source_name'] not in ['TZ']:
        return

    for layer in ctx.get('layers'):
        out = {
            "type": "FeatureCollection",
            "features": []
        }

        for fdata in layer.pop('data').get('features'):
            attrs = get_attrs(fdata)
            if not fdata.get('geometry', {}).get('rings'):
                continue

            props = dict(attrs)
            props['layer'] = layer.get('name')
            feature = {
                'type': 'Feature',
                'geometry': {
                    'type': 'Polygon',
                    'coordinates': fdata.get('geometry', {}).get('rings')
                },
                'properties': props
            }
            out['features'].append(feature)

        name = slugify('%s %s' % (ctx['source_name'], layer.get('name')),
                       sep='_')
        name = name + '.json'
    with open(os.path.join(DEST_PATH, name), 'w') as fh:
            json.dump(out, fh)
Example #9
def extract_address(ext, prefix, query):
    if query is None:
        return {}
    data = {
        prefix + '_official_name': ext.text(query+'OFFICIALNAME'),
        prefix + '_address': ext.text(query+'ADDRESS'),
        prefix + '_town': ext.text(query+'TOWN'),
        prefix + '_postal_code': ext.text(query+'POSTAL_CODE'),
        prefix + '_country': ext.attr(query+'COUNTRY', 'VALUE'),
        prefix + '_attention': ext.text(query+'ATTENTION'),
        prefix + '_phone': ext.text(query+'PHONE'),
        prefix + '_email': ext.text(query+'EMAIL') or ext.text(query+'E_MAIL'),
        prefix + '_fax': ext.text(query+'FAX'),
        prefix + '_url': ext.text(query+'URL_GENERAL') or ext.text(query+'URL'),
        prefix + '_url_buyer': ext.text(query+'URL_BUYER'),
        prefix + '_url_info': ext.text(query+'URL_INFORMATION'),
        prefix + '_url_participate': ext.text(query+'URL_PARTICIPATE')
    }

    if data[prefix + '_official_name'] is not None:
        data[prefix + '_slug'] = slugify(data[prefix + '_official_name'])

    for k, v in list(data.items()):
        if v is None:
            del data[k]
    return data
Example #10
def store_layer_to_geojson(data, layer, features):
    """Store the returned data as a GeoJSON file."""
    # skip if we're not loading geometries:
    if QUERY['returnGeometry'] != 'true':
        return

    out = {
        "type": "FeatureCollection",
        "features": []
    }
    for fdata in features:
        attrs = {}
        for k, v in fdata.get('attributes').items():
            k = k.lower().strip()
            attrs[k] = v

        if not fdata.get('geometry', {}).get('rings'):
            continue

        props = dict(attrs)
        props['layer'] = layer.get('name')
        out['features'].append({
            'type': 'Feature',
            'geometry': {
                'type': 'Polygon',
                'coordinates': fdata.get('geometry', {}).get('rings')
            },
            'properties': props
        })

    name = slugify('%s %s' % (data['name'], layer.get('name')), sep='_')
    name = name + '.geojson'
    log.info('    -> %s', name)
    with open(os.path.join(DATA_PATH, name), 'w') as fh:
        json.dump(out, fh)
Example #11
def slugify(mapping, bind, values):
    """ Transform all values into URL-capable slugs. """
    for value in values:
        if isinstance(value, six.string_types):
            value = transliterate(value)
            value = normality.slugify(value)
        yield value
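
Driven by hand, this transform slugifies strings and passes everything else through unchanged; a minimal sketch, assuming normality's transliteration and that mapping/bind go unused:

values = [u'Häschen Spaß', 42, None]
list(slugify(None, None, values))
# -> ['haschen-spass', 42, None]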
Example #12
def make_filename(file_name, sep='-'):
    if file_name is not None:
        file_name = os.path.basename(file_name)
        slugs = [slugify(s, sep=sep) for s in file_name.rsplit('.', 1)]
        slugs = [s[:200] for s in slugs if s is not None]
        file_name = '.'.join(slugs)
        file_name = file_name.strip('.').strip(sep)
    return file_name
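
Compared with the variant in Example #6, this one splits only on the last dot, so interior dots are slugified away; a sketch with a made-up name:

make_filename('Annual Report 2014.Draft.XLSX')
# -> 'annual-report-2014-draft.xlsx' (one extension kept, parts capped at 200 chars)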
Example #13
 def add_codelist(codelist_name, codelist_data):
     codelist = models.Codelist()
     codelist.code = normality.slugify(codelist_name)
     codelist.name = codelist_name
     db.session.add(codelist)
     db.session.commit()
     
     for codelist_code in codelist_data:
         add_codelist_data(codelist_name, codelist_code)
     db.session.commit()
Example #14
File: util.py Project: occrp/loom
def create_fixtures():
    if 'engine' not in SHARED:
        conn = dataset.connect('sqlite://')
        for table in ['companies', 'financials']:
            with open(os.path.join(FIXTURE_PATH, table + '.csv'), 'r') as fh:
                for row in unicodecsv.DictReader(fh):
                    data = {slugify(k, sep='_'): v for k, v in row.items()}
                    conn[table].insert(data)
        SHARED['engine'] = conn.engine
    return SHARED['engine']
Example #15
def load_countries():
    if len(COUNTRIES):
        return COUNTRIES
    with open(os.path.join(DATA_FIXTURES, 'countries.csv'), 'r') as fh:
        for row in unicodecsv.DictReader(fh):
            name = slugify(row['name'], sep=' ').strip()
            code = row['code'].strip().upper()
            REQUESTED.append({'name': row['name'], 'code': code})
            COUNTRIES[name] = code
    return COUNTRIES
Example #16
def load_countries():
    if len(COUNTRIES):
        return COUNTRIES
    with open(os.path.join(DATA_FIXTURES, "countries.csv"), "r") as fh:
        for row in unicodecsv.DictReader(fh):
            name = slugify(row["name"], sep=" ").strip()
            code = row["code"].strip().upper()
            REQUESTED.append({"name": row["name"], "code": code})
            COUNTRIES[name] = code
    return COUNTRIES
Example #17
def convert_row(row):
    out = {}
    for field, value in row.items():
        field = slugify(field, sep='_')
        value = value.strip()
        # TODO handle excel dates etc.
        if not len(value):
            continue
        out[field] = value
    return out
Example #18
 def add_column(self, label):
     column = slugify(label or "", sep="_") or "column"
     column = column[:55]
     name, i = column, 2
     # de-dupe: column, column_2, column_3, ...
     while name in [c.name for c in self.columns]:
         name = "%s_%s" % (column, i)
         i += 1
     column = {"label": label, "name": name}
     self.schema["columns"].append(column)
     return TabularColumn(self, column)
Example #19
def column_alias(cell, names):
    """ Generate a normalized version of the column name. """
    column = slugify(cell.column or '', sep='_')
    column = column.strip('_')
    column = 'column' if not len(column) else column
    name, i = column, 2
    # de-dupe: column, column_2, column_3, ...
    while name in names:
        name = '%s_%s' % (column, i)
        i += 1
    return name
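
A short illustrative run of the de-dupe loop; the cell object is faked with a namedtuple, which is an assumption about its shape:

from collections import namedtuple

Cell = namedtuple('Cell', 'column')              # stand-in for the real cell type
column_alias(Cell('Date of Birth'), set())       # -> 'date_of_birth'
column_alias(Cell('Date'), {'date', 'date_2'})   # -> 'date_3'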
Example #20
    def to_proxy(self):
        meta = dict(self.meta)
        headers = meta.pop('headers', {})
        headers = {slugify(k, sep='_'): v for k, v in headers.items()}
        proxy = model.get_proxy({
            'id': str(self.id),
            'schema': self.model,
            'properties': meta
        })
        proxy.set('contentHash', self.content_hash)
        proxy.set('parent', self.parent_id)
        proxy.set('ancestors', self.ancestors)
        proxy.set('processingStatus', self.status)
        proxy.set('processingError', self.error_message)
        proxy.set('fileSize', meta.get('file_size'))
        proxy.set('fileName', meta.get('file_name'))
        if not proxy.has('fileName'):
            disposition = headers.get('content_disposition')
            if disposition is not None:
                _, attrs = cgi.parse_header(disposition)
                proxy.set('fileName', attrs.get('filename'))
        proxy.set('mimeType', meta.get('mime_type'))
        if not proxy.has('mimeType'):
            proxy.set('mimeType', headers.get('content_type'))
        proxy.set('language', meta.get('languages'))
        proxy.set('country', meta.get('countries'))
        proxy.set('authoredAt', meta.get('authored_at'))
        proxy.set('modifiedAt', meta.get('modified_at'))
        proxy.set('publishedAt', meta.get('published_at'))
        proxy.set('retrievedAt', meta.get('retrieved_at'))
        proxy.set('sourceUrl', meta.get('source_url'))
        proxy.set('messageId', meta.get('message_id'), quiet=True)
        proxy.set('inReplyTo', meta.get('in_reply_to'), quiet=True)
        proxy.set('bodyText', self.body_text, quiet=True)
        proxy.set('bodyHtml', self.body_raw, quiet=True)
        columns = meta.get('columns')
        proxy.set('columns', registry.json.pack(columns), quiet=True)
        proxy.set('headers', registry.json.pack(headers), quiet=True)

        pdf = 'application/pdf'
        if meta.get('extension') == 'pdf' or proxy.first('mimeType') == pdf:
            proxy.set('pdfHash', self.content_hash, quiet=True)
        proxy.add('pdfHash', meta.get('pdf_version'), quiet=True)

        q = db.session.query(DocumentTag)
        q = q.filter(DocumentTag.document_id == self.id)
        q = q.filter(DocumentTag.type.in_(DocumentTag.MAPPING.keys()))
        q = q.order_by(DocumentTag.weight.desc())
        q = q.limit(Document.MAX_TAGS)
        for tag in q.all():
            prop = DocumentTag.MAPPING.get(tag.type)
            if prop is not None:
                proxy.add(prop, tag.text)
        return proxy
Example #21
 def __init__(self, context, parent, node):
     self.context = context
     self.parent = parent
     self.node = node
     self._results = []
     if node.name is None:
         self.id = 'root'
     else:
         prefix = '_any' if node.name == '*' else node.name
         id = '%s_%s' % (prefix, uuid4().hex[:5])
         self.id = slugify(id, '_')
     self.var = v[self.id]
Example #22
File: tabular.py Project: 01-/aleph
 def add_column(self, label):
     column = slugify(label or '', sep='_')
     column = column or 'column'
     column = column[:55]
     name, i = column, 2
     # de-dupe: column, column_2, column_3, ...
     while name in [c.name for c in self.columns]:
         name = '%s_%s' % (column, i)
         i += 1
     column = {'label': label, 'name': name}
     self.schema['columns'].append(column)
     return TabularColumn(self, column)
Example #23
def cleanup(row):
    data = {}
    for k, v in row.items():
        k = slugify(k, sep='_')
        if isinstance(v, str):
            v = v.strip()
            if not len(v):
                continue
        if v is None:
            continue
        data[k] = v
    return data
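
With the str check above, a made-up row shows what survives the cleanup: blank strings and None values are dropped, non-string values pass through:

row = {'First Name ': ' Anna ', 'Notes': '   ', 'Age': 39, 'Fax': None}
cleanup(row)   # -> {'first_name': 'Anna', 'age': 39}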
Example #24
def get_files(data):
    url = data.get('issue_url')
    for href, a in content_links(url):
        d = data.copy()
        d['file'] = a.text_content()
        if href.endswith('/view'):
            href, _ = href.rsplit('/view', 1)
        if not href.endswith('.pdf'):
            continue
        d['url'] = href
        file_name = slugify(d['file'], sep='_')
        path = slugify(d['issue'], sep='_')
        file_name = os.path.join(DATA_PATH, 'boletin', path, file_name)
        try:
            os.makedirs(os.path.dirname(file_name))
        except OSError:
            pass
        print([file_name])
        if not os.path.isfile(file_name):
            urlretrieve(d['url'], file_name)
        documentcloudify(file_name, d)
Example #25
File: config.py Project: occrp/loom
 def get_alias(self, schema):
     """ Slightly hacky way of getting a slug-like name for a schema. This
     is used to determine document types in the Elastic index. """
     for alias, uri in self.schemas.items():
         if uri == schema:
             return alias
     p = urlparse.urlparse(schema)
     name, _ = os.path.splitext(os.path.basename(p.path))
     name = slugify(name, sep='_')
     if not len(name) or name in self.schemas:
         raise ConfigException("Cannot determine alias for: %r" % schema)
     self['schemas'][name] = schema
     return name
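
When no configured alias matches, the name is derived from the URI path; a sketch of that fallback with a hypothetical schema URL (Python 3 urllib shown):

from urllib.parse import urlparse
import os

uri = 'http://schema.example.org/types/legal-entity.json'   # invented
name, _ = os.path.splitext(os.path.basename(urlparse(uri).path))
slugify(name, sep='_')   # -> 'legal_entity'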
Example #26
def make_person(engine, beitrag, fp, source_url):
    person = {
        'fingerprint': fp,
        'slug': slugify(fp, sep='-'),
        'source_url': source_url,
        'vorname': beitrag['vorname'],
        'nachname': beitrag['nachname'],
        'ort': beitrag.get('ort'),
        'ressort': beitrag.get('ressort'),
        'land': beitrag.get('land'),
        'fraktion': beitrag.get('fraktion')
    }
    tbl_person.upsert(person, ['fingerprint'])
    return fp
Example #27
    def make_entity(self, name, type_, raw={}):
        slug = slugify(name)
        if not slug:
            return
        entity = {'name': name, 'slug': slug, 'type': type_}

        for k, v in raw.items():
            entity['raw_%s' % k] = v
        key = (type_, slug)
        if key in self.entities:
            self.entities[key].update(entity)
        else:
            self.entities[key] = entity
        return slug
Example #28
    def crawl_document(self, url):
        try:
            self.check_tag(url=url)
        except TagExists:
            pass

        res = requests.get(url)
        doc = html.fromstring(res.content)
        data = {
            'details_url': url,
            'title': doc.findtext('.//div[@class="c00v3-introduction"]/h1'),
            'summary': doc.findtext('.//span[@id="detail_abstract"]') or
            doc.findtext('.//span[@id="summary_abstract"]')
        }

        log.info("Crawling WB document: %s, %s", data['title'], url)

        if doc.find('.//div[@id="CitationHidDiv"]') is not None:
            text = clean(doc.find('.//div[@id="CitationHidDiv"]'))
            data['citation'] = text

        for li in doc.findall('.//ul[@class="detail"]/li'):
            label = li.findtext('./label')
            if label is None:
                continue
            label = slugify(label, sep='_')
            value = li.find('./span').xpath('string()')
            if value is None:
                continue
            value = value.strip().strip(';')

            if label == 'rel_proj_id':
                values = value.split(' -- ')
                value = values[0]
                if len(values) > 1:
                    data['project_id'] = values[1]

            if len(value):
                data[label] = clean(value)

        for li in doc.findall('.//ul[@class="documentLnks"]/li'):
            record = data.copy()
            if li.get('class') != 'textdoc':
                doc_url = li.find('a').get('href')
                # from pprint import pprint
                # pprint(record)
                self.emit_url(doc_url, title=data['title'],
                              summary=data['summary'],
                              meta=record)
Example #29
def render_changes(changes):
    if not len(changes):
        return None

    with open('notification.jinja.html', 'r') as fh:
        template = Template(fh.read())

    layers = {}
    source = {}
    for change in changes:
        data = change.get('record_new') or change.get('record_old')

        if not len(source):
            source = {
                'url': data.get('source_url'),
                'title': data.get('source_title'),
                'name': data.get('source_name')
            }

        layer_id = data.get('layer_id')
        if layer_id not in layers:
            csv_name = '%(source_name)s %(layer_name)s' % data
            csv_name = slugify(csv_name, sep='_')
            csv_name = 'http://data.pudo.org/flexicadastre/csv/%s.csv' % csv_name

            layers[layer_id] = {
                'id': layer_id,
                'title': data.get('layer_name'),
                'csv': csv_name,
                'changes': []
            }

        for obj in [change.get('record_new'), change.get('record_old')]:
            obj.pop('layer_id', None)
            obj.pop('layer_name', None)
            obj.pop('source_title', None)
            obj.pop('source_name', None)
            obj.pop('source_url', None)

        headers = list(change.get('record_new').keys())
        headers.extend(change.get('record_old').keys())
        change['headers'] = list(sorted(headers))

        layers[layer_id]['changes'].append(change)

    out = template.render(source=source, layers=layers)
    out_name = os.path.join(DATA_PATH, 'report_%s_%s.html' % (source['name'], TODAY))
    with open(out_name, 'w', encoding='utf-8') as fo:
        fo.write(out)
Example #30
 def crawl(self, directory=None, collection=None, meta={}):
     directory = string_value(directory)
     if directory is None or not os.path.exists(directory):
         log.error("Invalid directory: %r", directory)
         return
     directory = os.path.abspath(os.path.normpath(directory))
     collection = collection or directory
     collection = Collection.create({
         'foreign_id': 'directory:%s' % slugify(collection),
         'label': collection
     })
     db.session.commit()
     meta = self.make_meta(meta)
     meta.source_path = directory
     ingest_directory(collection.id, meta, directory)
Example #31
def parse_entry(emitter, entry):
    entity = emitter.make("LegalEntity")
    if entry.find("./default:subjectType",
                  NS).get("classificationCode") == "P":
        entity = emitter.make("Person")
    reference_no = slugify(entry.get("euReferenceNumber"))
    entity.id = "fsf-%s" % reference_no

    regulation = entry.find("./default:regulation", NS)
    source_url = regulation.findtext("./default:publicationUrl", "", NS)
    entity.add("sourceUrl", source_url)

    sanction = emitter.make("Sanction")
    sanction.make_id(entity.id)
    sanction.add("entity", entity)
    sanction.add("authority", "European Union")
    sanction.add("sourceUrl", source_url)
    program = jointext(
        regulation.get("programme"),
        regulation.get("numberTitle"),
        sep=" - ",
    )
    sanction.add("program", program)
    sanction.add("reason", entry.findtext("./default:remark", "", NS))
    sanction.add("startDate", regulation.get("entryIntoForceDate"))

    for name in entry.findall("./default:nameAlias", NS):
        if entity.has("name"):
            entity.add("alias", name.get("wholeName"))
        else:
            entity.add("name", name.get("wholeName"))
        entity.add("title", name.get("title"), quiet=True)
        entity.add("firstName", name.get("firstName"), quiet=True)
        entity.add("middleName", name.get("middleName"), quiet=True)
        entity.add("lastName", name.get("lastName"), quiet=True)
        entity.add("position", name.get("function"), quiet=True)
        gender = GENDERS.get(name.get("gender"))
        entity.add("gender", gender, quiet=True)

    # TODO: support other types of ID
    for pnode in entry.findall(
            './default:identification[@identificationTypeCode="passport"]',
            NS):
        passport = emitter.make("Passport")
        passport.make_id("Passport", entity.id, pnode.get("logicalId"))
        passport.add("holder", entity)
        passport.add("passportNumber", pnode.get("number"))
        passport.add("country", pnode.get("countryIso2Code"))
        emitter.emit(passport)

    for node in entry.findall("./default:address", NS):
        address = jointext(
            node.get("street"),
            node.get("city"),
            node.findtext("default:zipCode", "", NS),
        )
        entity.add("address", address)
        entity.add("country", node.get("countryIso2Code"))

    for birth in entry.findall("./default:birthdate", NS):
        entity.add("birthDate", birth.get("birthdate"))
        entity.add("birthPlace", birth.get("city"))

    for country in entry.findall("./default:citizenship", NS):
        entity.add("nationality", country.get("countryIso2Code"), quiet=True)

    emitter.emit(entity)
    emitter.emit(sanction)
Example #32

for outdir, url in SECTION.items():
    res = requests.get(url)
    sio = StringIO(res.content)
    # with open(os.path.join(DATA, '%s.csv' % section), 'r') as fh:
    for row in unicodecsv.DictReader(sio):
        data = {}
        for k, v in row.items():
            v = v.strip()
            if not len(v):
                continue
            elif v.lower() == 'n/a':
                continue
            k = k.replace("'", '')
            nk = slugify(k, sep='_')
            nk = RENAMES.get(nk, nk)
            if nk in data:
                raise ValueError(nk, k)
            if v.lower() == 'yes':
                v = True
            elif v.lower() == 'no':
                v = False
            if nk in SPLIT_FIELDS:
                v = [x.strip() for x in v.split(',') if len(x.strip())]
            data[nk] = v
        urlslug = slugify(data.get('name'), sep='-')
        if urlslug is None or not len(urlslug):
            print([data.get('name'), urlslug])
            continue
        fileslug = slugify(data.get('name'), sep='_')
Example #33
def convert_snakecase(name):
    name = titlecase(name)
    return slugify(name, sep='_')
Example #34
 def test_petro(self):
     text = u"Порошенко Петро Олексійович"
     self.assertEqual("porosenko-petro-oleksijovic", slugify(text))
     self.assertEqual("Porosenko Petro Oleksijovic", ascii_text(text))
     self.assertEqual(u"Porošenko Petro Oleksíjovič", latinize_text(text))
     self.assertEqual(u"порошенко петро олексіиович", normalize(text))
Example #35
 def node_id(self, value: str) -> str:
     return "addr:%s" % slugify(value)
Example #36
def slugify(mapping, bind, values):
    """ Transform all values into URL-capable slugs. """
    return [normality.slugify(v) for v in values]
Example #37
 def extension(self):
     name, ext = os.path.splitext(self.real_path)
     if len(ext):
         return slugify(ext, '')
Example #38
# coding: utf-8
from normality import normalize, latinize_text, ascii_text, slugify

SAMPLES = [
    u'Порошенко Петро Олексійович',
    u'FUAD ALIYEV ƏHMƏD OĞLU',
    u'Häschen Spaß',
    u'ავლაბრის ფონდი',
]

for sample in SAMPLES:
    print('SAMPLE :', sample)
    print('  NORM :', normalize(sample))
    print('  SLUG :', slugify(sample))
    print('  LATIN:', latinize_text(sample))
    print('  ASCII:', ascii_text(sample))
Example #39
def normalizeDBcol(col):
    return slugify(col).replace('-', '_')
Example #40
 def headers(self):
     # normalize header names
     headers = {}
     for k, v in self.data.get('headers', {}).items():
         headers[slugify(k, sep='_')] = v
     return headers
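
The normalization is a key-wise slugify; with a made-up header mapping:

raw = {'Content-Type': 'text/csv', 'X-Request-ID': 'abc'}
{slugify(k, sep='_'): v for k, v in raw.items()}
# -> {'content_type': 'text/csv', 'x_request_id': 'abc'}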
Example #41
 def node_id(self, value):
     return "name:%s" % slugify(value)
Example #42
def make_id(*parts):
    parts = [str(p) for p in parts if p is not None]
    parts = [slugify(p, sep='-') for p in parts if len(p)]
    return ':'.join(parts)
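
A quick sketch of the resulting identifiers; the parts are invented, and None values are dropped before slugification:

make_id('company', 'ACME Holdings Ltd.', 42, None)
# -> 'company:acme-holdings-ltd:42'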
Example #43
 def add_codelist_data(codelist_name, codelist_code):
     codelistcode = models.CodelistCode()
     codelistcode.code = codelist_code["code"]
     codelistcode.name = codelist_code["name"]
     codelistcode.codelist_code = normality.slugify(codelist_name)
     db.session.add(codelistcode)
Example #44
def slug(name):
    return slugify(name, sep='_')
Example #45
 def headers(self):
     raw = self.meta.get('headers', {})
     return {slugify(k, sep='_'): v for k, v in raw.items()}
Example #46
def crawl_person(context, name, url):
    # context.log.info("Crawling member", name=name, url=url)
    res = context.http.get(url)
    doc = html.fromstring(res.text)
    _, person_id = url.rsplit("/", 1)
    person = context.make("Person")
    person.id = f"eu-cor-{person_id}"
    person.add("sourceUrl", url)
    person.add("name", name)
    person.add("topics", "role.pep")

    last_name, first_name = name.split(", ", 1)
    person.add("firstName", first_name)
    person.add("lastName", last_name)

    address = {}
    details = doc.find('.//div[@class="regular-details"]')
    for row in details.findall('.//ul[@class="no-bullet"]/li'):
        children = row.getchildren()
        title = children[0]
        title_text = collapse_spaces(stringify(title.text_content()))
        title_text = title_text or title.get("class")
        value = collapse_spaces(title.tail)
        if title_text in ("Full name:", "Address:",
                          "Declaration of interests"):
            # ignore these.
            continue
        if title_text == "Emails:":
            emails = [e.text for e in row.findall(".//a")]
            person.add("email", emails)
            continue
        if "glyphicon-phone" in title_text:
            person.add("phone", value.split(","))
            continue
        if "fa-fax" in title_text:
            # TODO: yeah, no
            # person.add("phone", value)
            continue
        if title_text in ("Web sites:", "list-inline"):
            sites = [e.get("href") for e in row.findall(".//a")]
            person.add("website", sites)
            continue
        if title_text == "Represented Country:":
            person.add("country", value)
            continue
        if title_text == "Languages:":
            # TODO: missing in FtM
            # person.add("languages", value.split(','))
            continue
        if "Regions since:" in title_text:
            date = parse_date(value)
            person.context["created_at"] = date
            continue
        if "Date of birth:" in title_text:
            person.add("birthDate", parse_date(value))
            continue
        if "Commissions:" in title_text:
            for com in row.findall(".//li"):
                text = collapse_spaces(com.text_content())
                sep = "Mandate - "
                if sep in text:
                    _, text = text.split(sep, 1)
                person.add("sector", text)
            continue
        if "Areas of interest:" in title_text:
            for area in row.findall(".//li"):
                person.add("keywords", area.text_content())
            continue
        if title.tag == "i" and value is None:
            person.add("position", title_text)
            continue
        if title_text == "Country:":
            person.add("country", value)
        if title_text in ("Street:", "Postal code:", "City:", "Country:"):
            address[title_text.replace(":", "")] = value
            continue
        if title_text == "Political group:":
            group = context.make("Organization")
            group.add("name", value)
            slug = value
            if "(" in slug:
                _, slug = slug.rsplit("(", 1)
            slug = slugify(slug, sep="-")
            group.id = f"eu-cor-group-{slug}"
            context.emit(group)
            member = context.make("Membership")
            member.make_id("Membership", person.id, group.id)
            member.add("member", person)
            member.add("organization", group)
            context.emit(member)
            continue

    address = (
        address.get("Street"),
        address.get("City"),
        address.get("Postal code"),
        address.get("Country"),
    )
    address = ", ".join([a for a in address if a is not None])
    person.add("address", address)
    context.emit(person)
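
The political-group id is derived from the text inside the trailing parentheses, when present; a sketch with an invented label:

value = "European People's Party (EPP)"   # hypothetical
slug = value
if "(" in slug:
    _, slug = slug.rsplit("(", 1)         # 'EPP)'
slug = slugify(slug, sep="-")             # -> 'epp'
group_id = f"eu-cor-group-{slug}"         # -> 'eu-cor-group-epp'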
Example #47
 def headers(self):
     # normalize header names
     raw = self._headers or {}
     return {slugify(k, sep='_'): v for k, v in raw.items()}
Example #48
 def create(cls, name, *keys):
     keys = [slugify(k, sep='-') for k in keys]
     entity_id = '-'.join([k for k in keys if k is not None])
     entity_id = '%s.%s' % (name, entity_id)
     entity = Entity(source=name, id=entity_id)
     return entity
Example #49
 def node_id(self, value):
     return 'name:%s' % slugify(value)
Example #50
 def node_id(self, value):
     return 'addr:%s' % slugify(value)
Example #51
 def display_slug(self):
     return slugify(self.display_label, sep=' ')
Example #52
 def test_slugify(self):
     text = u'BABY! camel-is good'
     self.assertEqual('baby-camel-is-good', slugify(text, sep='-'))
Example #53
 def test_german(self):
     text = u"Häschen Spaß"
     self.assertEqual("Haschen Spass", ascii_text(text))
     self.assertEqual("haschen-spass", slugify(text, sep="-"))
Example #54
 def node_id(self, value: str) -> Optional[str]:
     slug = slugify(value)
     if slug is None:
         return None
     return f"name:{slug}"
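
The None guard matters because slugify returns None once every character has been stripped; a tiny check:

slugify('Warszawa 12')   # -> 'warszawa-12'
slugify('!!!')           # -> None, so node_id('!!!') is None instead of 'name:None'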
Example #55
 def node_id(self, value: str) -> Optional[str]:
     slug = slugify(value)
     if slug is None:
         return None
      return f"addr:{slug}"
Example #56
 def test_petro(self):
     text = u'Порошенко Петро Олексійович'
     self.assertEqual('porosenko-petro-oleksijovic', slugify(text))
     self.assertEqual('Porosenko Petro Oleksijovic', ascii_text(text))
     self.assertEqual(u'Porošenko Petro Oleksíjovič', latinize_text(text))
     self.assertEqual(u'порошенко петро олексіиович', normalize(text))
Example #57
def convert_snakecase(name):
    if name.upper() != name:
        name = titlecase(name)
    return slugify(name, sep='_')
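
The upper-case check keeps mixed-case names from being mangled by titlecase; the sample values are invented:

convert_snakecase('DATE OF BIRTH')   # titlecased first -> 'date_of_birth'
convert_snakecase('GICS Sector')     # mixed case, left as-is -> 'gics_sector'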
Example #58
 def __init__(self, matcher, out_dir, file_name):
     self.file_name = decode_path(file_name)
     base_name = slugify(file_name, sep='_')
     super(CsvTableSearcher, self).__init__(matcher, out_dir, base_name)
Example #59
import os
from normality import slugify

PROJECT_NAME = 'Follow the Money'
PROJECT_NAME = os.environ.get('INTEGRATE_PROJECT_NAME', PROJECT_NAME)

DATABASE_URI = 'sqlite:///integrate.sqlite3'
DATABASE_URI = os.environ.get('INTEGRATE_DATABASE_URI', DATABASE_URI)
DATABASE_PREFIX = os.environ.get('INTEGRATE_DATABASE_PREFIX', 'ftm')
DATABASE_PREFIX = slugify(DATABASE_PREFIX, sep='_')

QUORUM = int(os.environ.get('INTEGRATE_QUORUM', 1))
Example #60
 def url(self):
     study_url = self.base + '/' + slugify(self.title)
      if self.area is None or isinstance(self.area, list):
         return study_url
     else:
         return study_url + '/' + self.area.lower()