def ipp_node_to_element(name, node):
    """ A `node` is a dict or a list produced by `build_tree_from_yaml_clean` or `transform_ipp_tree`.

    Convert it to an lxml element: a BAREME (scale with TRANCHE/SEUIL/TAUX children), a NODE
    (recursive container), or a CODE (leaf holding a list of values). Returns None whenever the
    produced element would be empty, so empty branches are pruned from the output tree.
    """
    if isinstance(node, dict):
        if node.get('TYPE') == u'BAREME':
            # A scale: one TRANCHE per slice, each holding its SEUIL (threshold) and TAUX (rate).
            bareme_element = etree.Element('BAREME', attrib = dict(
                code = strings.slugify(name, separator = u'_'),
                origin = u'ipp',
                ))
            for slice_name in node.get('SEUIL', {}).keys():
                tranche_element = etree.Element('TRANCHE', attrib = dict(
                    code = strings.slugify(slice_name, separator = u'_'),
                    ))
                seuil_element = etree.Element('SEUIL')
                values, format, type = prepare_xml_values(name, node.get('SEUIL', {}).get(slice_name, []))
                transform_values_to_element_children(values, seuil_element)
                if len(seuil_element) > 0:
                    tranche_element.append(seuil_element)
                taux_element = etree.Element('TAUX')
                # NOTE(review): slices are keyed by the SEUIL dict; a slice present in TAUX but
                # absent from SEUIL would be silently dropped — confirm this is intended.
                values, format, type = prepare_xml_values(name, node.get('TAUX', {}).get(slice_name, []))
                transform_values_to_element_children(values, taux_element)
                if len(taux_element) > 0:
                    tranche_element.append(taux_element)
                if len(tranche_element) > 0:
                    bareme_element.append(tranche_element)
            return bareme_element if len(bareme_element) > 0 else None
        else:
            # Plain container node: recurse into every child entry.
            node_element = etree.Element('NODE', attrib = dict(
                code = strings.slugify(name, separator = u'_'),
                origin = u'ipp',
                ))
            for key, value in node.items():
                child_element = ipp_node_to_element(key, value)
                if child_element is not None:
                    node_element.append(child_element)
            return node_element if len(node_element) > 0 else None
    else:
        # Leaf: a list of values becomes a CODE element.
        assert isinstance(node, list), node
        values, format, type = prepare_xml_values(name, node)
        if not values:
            return None
        code_element = etree.Element('CODE', attrib = dict(
            code = strings.slugify(name, separator = u'_'),
            origin = u'ipp',
            ))
        if format is not None:
            code_element.set('format', format)
        if type is not None:
            code_element.set('type', type)
        transform_values_to_element_children(values, code_element)
        return code_element if len(code_element) > 0 else None
def iter_categories_slug(organism_types_only = False, tags_slug = None, term = None):
    # Return an iterable of category slugs matching every given criterion (criteria are ANDed).
    intersected_sets = []
    if organism_types_only:
        # Restrict to categories that are organism types (i.e. that have a pivot code).
        intersected_sets.append(set(category_slug_by_pivot_code.itervalues()))
    for tag_slug in set(tags_slug or []):
        if tag_slug is not None:
            # NOTE(review): .get() may append None for an unknown tag — presumably
            # intersection_set treats None as "no match"; confirm against its implementation.
            intersected_sets.append(categories_slug_by_tag_slug.get(tag_slug))
    if term:
        # Each word of the search term is used as a prefix matched against indexed words.
        prefixes = strings.slugify(term).split(u'-')
        categories_slug_by_prefix = {}
        for prefix in prefixes:
            if prefix in categories_slug_by_prefix:
                # TODO? Handle categories with several words sharing the same prefix?
                continue
            categories_slug_by_prefix[prefix] = union_set(
                word_categories_slug
                for word, word_categories_slug in categories_slug_by_word.iteritems()
                if word.startswith(prefix)
                ) or set()
        intersected_sets.extend(categories_slug_by_prefix.itervalues())
    categories_slug = intersection_set(intersected_sets)
    if categories_slug is None:
        # No criterion at all => every category matches.
        return category_by_slug.iterkeys()
    return categories_slug
def json_to_dated_python(self):
    # Return a converter that turns a JSON value (item index or item name) into the
    # enumeration item index.
    enum = self.enum
    if enum is None:
        # No enumeration attached: only validate the type and coerce to int.
        return conv.pipe(
            conv.test_isinstance((basestring, int)),
            conv.anything_to_int,
            )
    # This converters accepts either an item number or an item name.
    index_by_slug = self.index_by_slug
    if index_by_slug is None:
        # Lazily build and cache the slug -> index mapping for name lookups.
        self.index_by_slug = index_by_slug = dict(
            (strings.slugify(name), index)
            for index, name in sorted(enum._vars.iteritems()))
    return conv.pipe(
        conv.test_isinstance((basestring, int)),
        conv.condition(
            conv.anything_to_int,
            conv.pipe(
                # Verify that item index belongs to enumeration.
                conv.anything_to_int,
                conv.test_in(enum._vars),
                ),
            conv.pipe(
                # Convert item name to its index.
                conv.input_to_slug,
                conv.test_in(index_by_slug),
                conv.function(lambda slug: index_by_slug[slug]),
                ),
            ),
        )
def json_to_dated_python(self):
    """Return a converter mapping a JSON item number or item name to the item index."""
    enum = self.enum
    if enum is None:
        # Without an enumeration, simply check the type and coerce to int.
        return conv.pipe(
            conv.test_isinstance((basestring, int)),
            conv.anything_to_int,
            )
    # This converters accepts either an item number or an item name.
    slug_to_index = self.index_by_slug
    if slug_to_index is None:
        slug_to_index = {}
        for index, name in sorted(enum._vars.iteritems()):
            slug_to_index[strings.slugify(name)] = index
        self.index_by_slug = slug_to_index
    numeric_branch = conv.pipe(
        # Verify that item index belongs to enumeration.
        conv.anything_to_int,
        conv.test_in(enum._vars),
        )
    named_branch = conv.pipe(
        # Convert item name to its index.
        conv.input_to_slug,
        conv.test_in(slug_to_index),
        conv.function(lambda slug: slug_to_index[slug]),
        )
    return conv.pipe(
        conv.test_isinstance((basestring, int)),
        conv.condition(conv.anything_to_int, numeric_branch, named_branch),
        )
def user_extract(req):
    # Create a dated copy of the legislation in context for the logged-in user, by posting
    # the legislation to the external API and storing the returned dated legislation.
    ctx = contexts.Ctx(req)
    user = model.get_user(ctx, check = True)
    if user.email is None:
        # An email is required to name the copy.
        return wsgihelpers.forbidden(ctx)
    legislation = ctx.node
    if legislation.is_owner(ctx) and legislation.is_dated:
        return wsgihelpers.bad_request(ctx, explanation = ctx._(u'This legislation is already dated.'))
    params = req.GET
    inputs = {
        'date': params.get('date'),
        }
    data, errors = conv.struct({
        'date': conv.pipe(
            conv.french_formatted_str_to_datetime,
            # Default to "now"; utcnow() is evaluated when the converter is built, i.e. per request.
            conv.default(datetime.datetime.utcnow()),
            ),
        })(inputs, state = ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation = errors)
    new_legislation = None
    new_legislation_title = ctx._(u'{} (copy {})').format(legislation.title, user.email)
    new_legislation_slug = strings.slugify(new_legislation_title)
    existing_legislations_cursor = model.Legislation.find(
        dict(
            slug = new_legislation_slug,
            ),
        as_class = collections.OrderedDict,
        )
    if existing_legislations_cursor.count() > 0:
        # A legislation with the same slug exists: reuse it if owned by the user, else refuse.
        for existing_legislation in existing_legislations_cursor:
            if existing_legislation.is_owner(ctx):
                return wsgihelpers.redirect(ctx, location = existing_legislation.get_user_url(ctx))
        # NOTE(review): new_legislation is always None at this point, so this guard is
        # currently redundant — confirm whether a reuse branch was intended here.
        if new_legislation is None:
            return wsgihelpers.bad_request(
                ctx,
                explanation = ctx._(u'A legislation with the same name already exists.'),
                )
    else:
        new_legislation = model.Legislation(
            author_id = user._id,
            datetime_begin = legislation.datetime_begin,
            datetime_end = legislation.datetime_end,
            description = ctx._(u'Copy of legislation "{}"').format(legislation.title),
            title = new_legislation_title,
            slug = new_legislation_slug,
            )
    # Ask the API to compute the dated legislation from the original one.
    response = requests.post(
        conf['api.urls.legislations'],
        headers = {
            'Content-Type': 'application/json',
            'User-Agent': conf['app_name'],
            },
        data = json.dumps(dict(date = data['date'].isoformat(), legislation = legislation.json)),
        )
    new_legislation.json = response.json(object_pairs_hook = collections.OrderedDict).get('dated_legislation')
    new_legislation.save(safe = True)
    return wsgihelpers.redirect(ctx, location = new_legislation.get_user_url(ctx))
def compute_words(self):
    """Refresh `self.words` with the sorted unique search words for this user."""
    fragments = [
        unicode(fragment)
        for fragment in (self._id, self.email, self.full_name)
        if fragment is not None
        ]
    slug = strings.slugify(u'-'.join(fragments))
    self.words = sorted(set(slug.split(u'-'))) or None
def compute_words(self):
    """Refresh `self.words` from the legislation's id, description and title."""
    pieces = (self._id, self.description, self.title)
    joined = u'-'.join(unicode(piece) for piece in pieces if piece is not None)
    self.words = sorted(set(strings.slugify(joined).split(u'-'))) or None
def setUp(self):  # noqa
    """Create the context and a sample legislation used by the tests."""
    super(TestLegislations, self).setUp()
    self.ctx = contexts.Ctx()
    title = u'Legislation 1'
    self.legislation = model.Legislation(
        description=title,
        slug=strings.slugify(title),
        title=title,
    )
    self.legislation.save(safe=True)
def compute_words(self):
    """Recompute the searchable words of this account from its id, email and full name."""
    parts = [self._id, self.email, self.full_name]
    text = u'-'.join(unicode(part) for part in parts if part is not None)
    words = set(strings.slugify(text).split(u'-'))
    self.words = sorted(words) or None
def compute_words(self):
    """Recompute the searchable words of this document from its id, description and title."""
    non_empty = [
        unicode(value)
        for value in (self._id, self.description, self.title)
        if value is not None
        ]
    slugified = strings.slugify(u'-'.join(non_empty))
    self.words = sorted(set(slugified.split(u'-'))) or None
def setUp(self):  # noqa
    """Set up the test context and a single sample legislation."""
    super(TestLegislations, self).setUp()
    self.ctx = contexts.Ctx()
    legislation_title = u'Legislation 1'
    legislation = model.Legislation(
        description = legislation_title,
        slug = strings.slugify(legislation_title),
        title = legislation_title,
        )
    legislation.save(safe = True)
    self.legislation = legislation
def iter_ids(cls, ctx, territory = None, coverages = None, term = None):
    """Return the set of POI ids matching the given territory, coverages and search term.

    All criteria are intersected. Returns ``cls.indexed_ids`` when no criterion restricts
    the result, and an empty set as soon as a requested coverage matches nothing.
    """
    intersected_sets = []
    if territory is not None:
        # Gather POIs attached to the territory's ancestors and to its descendants.
        ancestor_territories_poi_sets = []
        for ancestor_id in territory.ancestors_id:
            ancestor_territories_poi_sets.append(cls.ids_by_territory_id.get(ancestor_id, set()))
        # Fix: .get() had no default here, so a territory without a descendants entry yielded
        # None and the loop raised a TypeError. Default to an empty iterable, consistent with
        # the other lookups in this method.
        for child_territory_id in ramdb.territories_id_by_ancestor_id.get(territory._id) or []:
            # NOTE(review): descendants use sim_ids_by_territory_id while ancestors use
            # ids_by_territory_id — confirm this asymmetry is intentional.
            ancestor_territories_poi_sets.append(cls.sim_ids_by_territory_id.get(child_territory_id, set()))
        intersected_sets.append(ramdb.union_set(ancestor_territories_poi_sets))
    for coverage in (coverages or []):
        coverage_slug = strings.slugify(coverage)
        coverage_pois_id = cls.ids_by_coverage.get(coverage_slug)
        if not coverage_pois_id:
            # An unmatched coverage empties the intersection: short-circuit.
            return set()
        intersected_sets.append(coverage_pois_id)
    # We should filter on term *after* having looked for competent organizations. Otherwise, when no organization
    # matching term is found, the nearest organizations will be used even when there are competent organizations
    # (that don't match the term).
    if term and isinstance(term, basestring):
        prefixes = strings.slugify(term).split(u'-')
        pois_id_by_prefix = {}
        for prefix in prefixes:
            if prefix in pois_id_by_prefix:
                # TODO? Handle pois with several words sharing the same prefix?
                continue
            pois_id_by_prefix[prefix] = ramdb.union_set(
                pois_id
                for word, pois_id in cls.ids_by_word.iteritems()
                if word.startswith(prefix)
                ) or set()
        intersected_sets.extend(pois_id_by_prefix.itervalues())
    found_pois_id = ramdb.intersection_set(intersected_sets)
    if found_pois_id is None:
        # No criterion given => every indexed POI matches.
        return cls.indexed_ids
    return found_pois_id
def duplicate(req):
    """Create a copy of the test case in context, owned by the logged-in user."""
    ctx = contexts.Ctx(req)
    test_case = ctx.node
    user = model.get_user(ctx, check=True)
    copy_title = ctx._(u'Copy of {}').format(test_case.title)
    copy = model.TestCase(
        author_id=user._id,
        description=copy_title,
        slug=strings.slugify(copy_title),
        title=copy_title,
    )
    copy.save(safe=True)
    return wsgihelpers.redirect(ctx, location=user.get_user_url(ctx))
def duplicate(req):
    """Duplicate the current test case for the logged-in user and redirect to the user page."""
    ctx = contexts.Ctx(req)
    original = ctx.node
    user = model.get_user(ctx, check = True)
    duplicated_title = ctx._(u'Copy of {}').format(original.title)
    duplicated = model.TestCase(
        author_id = user._id,
        description = duplicated_title,
        slug = strings.slugify(duplicated_title),
        title = duplicated_title,
        )
    duplicated.save(safe = True)
    return wsgihelpers.redirect(ctx, location = user.get_user_url(ctx))
def convert_taxipp_name_tree(value, state = None):
    # Recursively validate a tree of TaxIPP names: dict nodes map strings to sub-trees,
    # leaves are TaxIPP name strings. The literal u'nc' is translated to None, and a leaf
    # is valid only when it is already in slug form (underscore-separated).
    return conv.condition(
        conv.test_isinstance(dict),
        conv.pipe(
            conv.uniform_mapping(
                conv.test_isinstance(basestring),
                convert_taxipp_name_tree,
                ),
            conv.empty_to_none,
            ),
        conv.pipe(
            conv.test_isinstance(basestring),
            conv.translate({u'nc': None}),
            # A valid name slugifies to itself (ignoring leading/trailing underscores).
            conv.test(lambda taxipp_name: strings.slugify(taxipp_name, separator = u'_') == taxipp_name.strip(u'_'),
                error = N_(u'Invalid TaxIPP name')),
            ),
        )(value, state = state or conv.default_state)
def convert_element_to_article(self, element, updated):
    # Build an article dict from an HTML element: extract the first heading (h1..h6) as the
    # title, remove the heading's enclosing header from the element, and collect metadata.
    title_url = None
    for xpath in (
            './/h1',
            './/h2',
            './/h3',
            './/h4',
            './/h5',
            './/h6',
            ):
        heading_elements = element.xpath(xpath)
        if len(heading_elements) > 0:
            title = lxml.html.tostring(heading_elements[0], encoding=unicode, method='text').strip()
            # Remove header from article element.
            header_element = None
            for ancestor_element in iter_element_ancestors(
                    heading_elements[0]):
                if ancestor_element.tag in ('a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup') \
                        or ancestor_element.tag == 'div' and ancestor_element.get('class') == 'page-header':
                    # Keep the outermost matching ancestor: it is the header wrapper to remove.
                    header_element = ancestor_element
                if ancestor_element.tag == 'a':
                    # A link wrapping the heading provides the article's canonical URL.
                    url, error = conv.pipe(
                        conv.make_input_to_url(),
                        conv.not_none,
                        )(ancestor_element.get('href'), state=self.ctx)
                    if error is None:
                        title_url = url
            # NOTE(review): assumes iter_element_ancestors always yields at least one matching
            # ancestor (likely the heading itself); header_element would be None otherwise —
            # confirm against iter_element_ancestors.
            header_element.getparent().remove(header_element)
            break
    else:
        # No heading found at any level.
        title = None
    return dict(
        element=element,
        # Prefer the element's own id; fall back to a slug of the title.
        hash=element.get('id') or strings.slugify(title),
        id=element.get('id'),
        node=self,
        title=title,
        title_url=title_url,
        updated=get_element_time(element, default=updated),
        )
def edit(req):
    """Update the title and description of the test case in context."""
    ctx = contexts.Ctx(req)
    user = model.get_user(ctx, check=True)
    params = req.params
    raw_inputs = {
        'description': params.get('description'),
        'title': params.get('title'),
    }
    data, errors = conv.struct({
        'description': conv.cleanup_line,
        'title': conv.cleanup_line,
    })(raw_inputs, state=ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation=errors)
    test_case = ctx.node
    test_case.title = data['title']
    test_case.slug = strings.slugify(data['title'])
    test_case.description = data['description']
    test_case.save(safe=True)
    return wsgihelpers.redirect(ctx, location=user.get_user_url(ctx))
def edit(req):
    """Apply the submitted title/description to the current test case and redirect."""
    ctx = contexts.Ctx(req)
    user = model.get_user(ctx, check = True)
    form_fields = {
        'title': req.params.get('title'),
        'description': req.params.get('description'),
        }
    cleaned, errors = conv.struct({
        'title': conv.cleanup_line,
        'description': conv.cleanup_line,
        })(form_fields, state = ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation = errors)
    node = ctx.node
    node.description = cleaned['description']
    node.slug = strings.slugify(cleaned['title'])
    node.title = cleaned['title']
    node.save(safe = True)
    return wsgihelpers.redirect(ctx, location = user.get_user_url(ctx))
def csv_infos_to_csv_bytes(csv_infos_by_schema_name, state = None):
    """Encode each schema's rows as a UTF-8 CSV file; return (bytes_by_filename, error)."""
    from . import ramdb
    if csv_infos_by_schema_name is None:
        return None, None
    if state is None:
        state = default_state
    encoded_csv_by_filename = {}
    for schema_name, csv_infos in csv_infos_by_schema_name.iteritems():
        output = StringIO()
        writer = csv.writer(output, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
        # Header row: column labels, with missing labels rendered as empty strings.
        writer.writerow([
            (label or u'').encode("utf-8")
            for label in csv_infos['columns_label']
            ])
        for row in csv_infos['rows']:
            encoded_row = [
                unicode(cell).encode('utf-8') if cell is not None else None
                for cell in row
                ]
            writer.writerow(encoded_row)
        schema_title = ramdb.schema_title_by_name.get(schema_name, schema_name)
        filename = '{0}.csv'.format(strings.slugify(schema_title))
        encoded_csv_by_filename[filename] = output.getvalue()
    return encoded_csv_by_filename or None, None
def convert_element_to_article(self, element, updated):
    # Extract an article from `element`: take the first heading (h1..h6) as its title,
    # strip the heading's header wrapper out of the element, and return the article metadata.
    title_url = None
    for xpath in (
            './/h1',
            './/h2',
            './/h3',
            './/h4',
            './/h5',
            './/h6',
            ):
        heading_elements = element.xpath(xpath)
        if len(heading_elements) > 0:
            title = lxml.html.tostring(heading_elements[0], encoding = unicode, method = 'text').strip()
            # Remove header from article element.
            header_element = None
            for ancestor_element in iter_element_ancestors(heading_elements[0]):
                if ancestor_element.tag in ('a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup') \
                        or ancestor_element.tag == 'div' and ancestor_element.get('class') == 'page-header':
                    # Remember the outermost matching wrapper; it is removed below.
                    header_element = ancestor_element
                if ancestor_element.tag == 'a':
                    # A link around the heading gives the article its canonical URL.
                    url, error = conv.pipe(
                        conv.make_input_to_url(),
                        conv.not_none,
                        )(ancestor_element.get('href'), state = self.ctx)
                    if error is None:
                        title_url = url
            # NOTE(review): header_element is assumed non-None here (iter_element_ancestors
            # presumably yields the heading itself) — confirm, else this raises AttributeError.
            header_element.getparent().remove(header_element)
            break
    else:
        # No heading at any level: article has no title.
        title = None
    return dict(
        element = element,
        # Prefer the element's own id; otherwise derive a hash from the title.
        hash = element.get('id') or strings.slugify(title),
        id = element.get('id'),
        node = self,
        title = title,
        title_url = title_url,
        updated = get_element_time(element, default = updated),
        )
def slugify_ipp_translation_key(key):
    """Slugify an IPP translation key, leaving the reserved keywords untouched."""
    if key in ("RENAME", "TYPE"):
        # Reserved structural keywords pass through unchanged.
        return key
    return strings.slugify(key, separator=u"_")
def main():
    """Parse the IPP XLS files into a tree of NODE/CODE/VALUE dicts rooted at `root_node`.

    Each XLS sheet is scanned with a small state machine (taxipp_names -> labels -> values
    -> notes -> description) keyed on the content of the first cell of each row.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', default='Baremes_IPP_2015', help='path of IPP XLS directory')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help="increase output verbosity")
    args = parser.parse_args()
    # args.dir = path
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING,
        stream=sys.stdout)
    root_node = dict(
        children=[],
        name="root",
        text=textwrap.dedent(u"""\
            Ce document présente l'ensemble de la législation permettant le calcul des contributions sociales, taxes
            sur les salaires et cotisations sociales. Il s'agit des barèmes bruts de la législation utilisés dans le
            micro-simulateur de l'IPP, TAXIPP. Les sources législatives (texte de loi, numéro du décret ou arrêté)
            ainsi que la date de publication au Journal Officiel de la République française (JORF) sont
            systématiquement indiquées. La première ligne du fichier (masquée) indique le nom des paramètres dans
            TAXIPP.

            Citer cette source :
            Barèmes IPP: prélèvements sociaux, Institut des politiques publiques, avril 2014.

            Auteurs :
            Antoine Bozio, Julien Grenet, Malka Guillot, Laura Khoury et Marianne Tenand

            Contacts :
            [email protected]; [email protected]; [email protected]

            Licence :
            Licence ouverte / Open Licence
            """).split(u'\n'),
        title=u"Barème IPP",
        type=u'NODE',
        )
    for bareme in baremes:
        xls_path = os.path.join(args.dir.decode('utf-8'), u"Baremes IPP - {0}.xls".format(bareme))
        if not os.path.exists(xls_path):
            log.warning("Skipping file {} that doesn't exist: {}".format(
                bareme, xls_path))
            continue
        log.info(u'Parsing file {}'.format(bareme))
        book = xlrd.open_workbook(filename=xls_path, formatting_info=True)
        # Ignore abbreviation/outline sheets and explicitly forbidden sheets.
        sheet_names = [
            sheet_name
            for sheet_name in book.sheet_names()
            if not sheet_name.startswith((u'Abréviations', u'Outline')) and sheet_name not in forbiden_sheets.get(bareme, [])
            ]
        sheet_title_by_name = {}
        for sheet_name in sheet_names:
            log.info(u'  Parsing sheet {}'.format(sheet_name))
            sheet = book.sheet_by_name(sheet_name)
            # Extract coordinates of merged cells.
            merged_cells_tree = {}
            for row_low, row_high, column_low, column_high in sheet.merged_cells:
                for row_index in range(row_low, row_high):
                    cell_coordinates_by_merged_column_index = merged_cells_tree.setdefault(
                        row_index, {})
                    for column_index in range(column_low, column_high):
                        # Every cell of a merged range resolves to the range's top-left cell.
                        cell_coordinates_by_merged_column_index[
                            column_index] = (row_low, column_low)
            if sheet_name.startswith(u'Sommaire'):
                # Associate the titles of the sheets to their Excel names.
                for row_index in range(sheet.nrows):
                    linked_sheet_number = transform_xls_cell_to_json(
                        book, sheet, merged_cells_tree, row_index, 2)
                    if isinstance(linked_sheet_number, int):
                        linked_sheet_title = transform_xls_cell_to_str(
                            book, sheet, merged_cells_tree, row_index, 3)
                        if linked_sheet_title is not None:
                            hyperlink = get_hyperlink(sheet, row_index, 3)
                            if hyperlink is not None and hyperlink.type == u'workbook':
                                linked_sheet_name = hyperlink.textmark.split(
                                    u'!', 1)[0].strip(u'"').strip(u"'")
                                sheet_title_by_name[
                                    linked_sheet_name] = linked_sheet_title
                continue
            descriptions_rows = []
            labels_rows = []
            notes_rows = []
            state = 'taxipp_names'
            taxipp_names_row = None
            values_rows = []
            for row_index in range(sheet.nrows):
                columns_count = len(sheet.row_values(row_index))
                if state == 'taxipp_names':
                    # First row of the sheet holds the TaxIPP variable names.
                    taxipp_names_row = [
                        taxipp_name
                        for taxipp_name in (
                            transform_xls_cell_to_str(
                                book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(columns_count))
                        ]
                    state = 'labels'
                    continue
                if state == 'labels':
                    first_cell_value = transform_xls_cell_to_json(
                        book, sheet, merged_cells_tree, row_index, 0)
                    date_or_year, error = conv.pipe(
                        conv.test_isinstance((int, basestring)),
                        cell_to_date,
                        conv.not_none,
                        )(first_cell_value, state=conv.default_state)
                    if error is not None:
                        # First cell of row is not a date => Assume it is a label.
                        labels_rows.append([
                            transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(columns_count)
                            ])
                        continue
                    state = 'values'
                if state == 'values':
                    first_cell_value = transform_xls_cell_to_json(
                        book, sheet, merged_cells_tree, row_index, 0)
                    if first_cell_value is None or isinstance(
                            first_cell_value, (int, basestring)):
                        date_or_year, error = cell_to_date(
                            first_cell_value, state=conv.default_state)
                        if error is None:
                            # First cell of row is a valid date or year.
                            values_row = [
                                transform_xls_cell_to_json(
                                    book, sheet, merged_cells_tree, row_index, column_index)
                                for column_index in range(columns_count)
                                ]
                            if date_or_year is not None:
                                assert date_or_year.year < 2601, 'Invalid date {} in {} at row {}'.format(
                                    date_or_year, sheet_name, row_index + 1)
                            values_rows.append(values_row)
                            continue
                    # NOTE(review): values_row here refers to the previous iteration's row —
                    # it may be unbound if the very first values-state row is not a date.
                    if all(value in (None, u'') for value in values_row):
                        # If first cell is empty and all other cells in line are also empty, ignore this line.
                        continue
                    # First cell has no date and other cells in row are not empty => Assume it is a note.
                    state = 'notes'
                if state == 'notes':
                    first_cell_value = transform_xls_cell_to_json(
                        book, sheet, merged_cells_tree, row_index, 0)
                    if isinstance(first_cell_value, basestring) and first_cell_value.strip(
                            ).lower() == 'notes':
                        notes_rows.append([
                            transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(columns_count)
                            ])
                        continue
                    state = 'description'
                assert state == 'description'
                descriptions_rows.append([
                    transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                    for column_index in range(columns_count)
                    ])
            # Join notes and descriptions into the sheet's text, separated by a blank line.
            text_lines = []
            for row in notes_rows:
                text_lines.append(u' | '.join(cell for cell in row if cell))
            if text_lines:
                text_lines.append(None)
            for row in descriptions_rows:
                text_lines.append(u' | '.join(cell for cell in row if cell))
            sheet_title = sheet_title_by_name.get(sheet_name)
            if sheet_title is None:
                log.warning(u"Missing title for sheet {} in summary".format(
                    sheet_name))
                continue
            # Collapse the (possibly multi-row) label rows into one label (or label tuple)
            # per column, dropping consecutive duplicates.
            labels = []
            for labels_row in labels_rows:
                for column_index, label in enumerate(labels_row):
                    if not label:
                        continue
                    while column_index >= len(labels):
                        labels.append([])
                    labels_column = labels[column_index]
                    if not labels_column or labels_column[-1] != label:
                        labels_column.append(label)
            labels = [
                tuple(labels_column1) if len(labels_column1) > 1 else labels_column1[0]
                for labels_column1 in labels
                ]
            cell_by_label_rows = []
            for value_row in values_rows:
                cell_by_label = collections.OrderedDict(
                    itertools.izip(labels, value_row))
                cell_by_label, errors = values_row_converter(
                    cell_by_label, state=conv.default_state)
                assert errors is None, "Errors in {}:\n{}".format(
                    cell_by_label, errors)
                cell_by_label_rows.append(cell_by_label)
            sheet_node = dict(
                children=[],
                name=strings.slugify(sheet_name, separator=u'_'),
                text=text_lines,
                title=sheet_title,
                type=u'NODE',
                )
            root_node['children'].append(sheet_node)
            # One CODE node per named TaxIPP variable, holding one VALUE child per data row.
            for taxipp_name, labels_column in zip(taxipp_names_row, labels):
                if not taxipp_name or taxipp_name in (u'date', ):
                    continue
                variable_node = dict(
                    children=[],
                    name=strings.slugify(taxipp_name, separator=u'_'),
                    title=u' - '.join(labels_column) if isinstance(
                        labels_column, tuple) else labels_column,
                    type=u'CODE',
                    )
                sheet_node['children'].append(variable_node)
                for cell_by_label in cell_by_label_rows:
                    # A cell may be an (amount, unit) tuple or a bare amount.
                    amount_and_unit = cell_by_label[labels_column]
                    variable_node['children'].append(
                        dict(
                            law_reference=cell_by_label[
                                u'Références législatives'],
                            notes=cell_by_label[u'Notes'],
                            publication_date=cell_by_label[u"Parution au JO"],
                            start_date=cell_by_label[
                                u"Date d'entrée en vigueur"],
                            type=u'VALUE',
                            unit=amount_and_unit[1] if isinstance(
                                amount_and_unit, tuple) else None,
                            value=amount_and_unit[0] if isinstance(
                                amount_and_unit, tuple) else amount_and_unit,
                            ))
            # dates = [
            #     conv.check(cell_to_date)(
            #         row[1] if bareme == u'Impot Revenu' else row[0],
            #         state = conv.default_state,
            #         )
            #     for row in values_rows
            #     ]
            # for column_index, taxipp_name in enumerate(taxipp_names_row):
            #     if taxipp_name and strings.slugify(taxipp_name) not in (
            #             'date',
            #             'date-ir',
            #             'date-rev',
            #             'note',
            #             'notes',
            #             'ref-leg',
            #             ):
            #         vector = [
            #             transform_cell_value(date, row[column_index])
            #             for date, row in zip(dates, values_rows)
            #             ]
            #         vector = [
            #             cell if not isinstance(cell, basestring) or cell == u'nc' else '-'
            #             for cell in vector
            #             ]
            #         # vector_by_taxipp_name[taxipp_name] = pd.Series(vector, index = dates)
            #         vector_by_taxipp_name[taxipp_name] = vector
    # print_node(root_node)
    return 0
def transform(xls_dir, yaml_raw_dir): file_system_encoding = sys.getfilesystemencoding() error_by_book_name = collections.OrderedDict() warning_by_book_name = collections.OrderedDict() for filename_encoded in sorted(os.listdir(xls_dir)): if not filename_encoded.endswith('.xls'): continue filename = filename_encoded.decode(file_system_encoding) log.info(u'Parsing file {}'.format(filename)) book_name = os.path.splitext(filename)[0] xls_path_encoded = os.path.join(xls_dir, filename_encoded) book = xlrd.open_workbook(filename = xls_path_encoded, formatting_info = True) book_yaml_dir_encoded = os.path.join(yaml_raw_dir, strings.slugify(book_name).encode(file_system_encoding)) if not os.path.exists(book_yaml_dir_encoded): os.makedirs(book_yaml_dir_encoded) error_by_sheet_name = collections.OrderedDict() sheet_english_title_by_name = collections.OrderedDict() sheet_title_by_name = collections.OrderedDict() warning_by_sheet_name = collections.OrderedDict() for sheet_name in book.sheet_names(): log.info(u' Parsing sheet {}.'.format(sheet_name)) sheet = book.sheet_by_name(sheet_name) sheet_error = None sheet_warning = None try: # Extract coordinates of merged cells. merged_cells_tree = {} for row_low, row_high, column_low, column_high in sheet.merged_cells: for row_index in range(row_low, row_high): cell_coordinates_by_merged_column_index = merged_cells_tree.setdefault( row_index, {}) for column_index in range(column_low, column_high): cell_coordinates_by_merged_column_index[column_index] = (row_low, column_low) if sheet_name.startswith((u'Sommaire', u'Outline')): french = sheet_name.startswith(u'Sommaire') # Associate the titles of the sheets to their Excel names. 
book_title = transform_xls_cell_to_str(book, sheet, merged_cells_tree, 1, 1) if not book_title: book_title = transform_xls_cell_to_str(book, sheet, merged_cells_tree, 2, 1) book_title = book_title.strip() assert book_title book_description = transform_xls_cell_to_str(book, sheet, merged_cells_tree, 4, 1) if not book_description: book_description = transform_xls_cell_to_str(book, sheet, merged_cells_tree, 5, 1) book_description = book_description.strip() assert book_description for column_index in range(1, 4): current_heading = u'Annexes' if french else u'Annexes' sheet_title_by_slug_by_heading = collections.OrderedDict() for row_index in range(sheet.nrows): heading = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 1) if isinstance(heading, basestring): # noqa F821 heading = heading.strip() if not heading: continue if heading == book_title or heading == book_description: continue if number_re.match(heading) is None: current_heading = heading continue linked_sheet_number = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, column_index) if isinstance(linked_sheet_number, int) or (isinstance(linked_sheet_number, basestring) and number_re.match(linked_sheet_number) is not None): # noqa F821 linked_sheet_title = transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index + 1) if linked_sheet_title is not None: linked_sheet_title = linked_sheet_title.strip() if linked_sheet_title: hyperlink = get_hyperlink(sheet, row_index, column_index + 1) if hyperlink is not None and hyperlink.type == u'workbook': linked_sheet_name = hyperlink.textmark.split(u'!', 1)[0].strip(u'"').strip(u"'") sheet_title_by_slug = sheet_title_by_slug_by_heading.setdefault(current_heading, collections.OrderedDict()) sheet_title_by_slug[strings.slugify(linked_sheet_name)] = linked_sheet_title if french: sheet_title_by_name[linked_sheet_name] = linked_sheet_title else: sheet_english_title_by_name[linked_sheet_name] = linked_sheet_title if 
sheet_title_by_slug_by_heading: break assert sheet_title_by_slug_by_heading book_notes = [] for column_index in range(8, 12): for row_index in range(sheet.nrows): note = transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index) if note and note.strip() == book_description: continue if book_notes or note: book_notes.append((note or u'').rstrip()) if note: blank_notes_count = 0 elif blank_notes_count >= 1: break else: blank_notes_count += 1 if book_notes: break while book_notes and not book_notes[-1]: del book_notes[-1] assert book_notes sheet_node = collections.OrderedDict(( (u'Titre' if french else u'Title', book_title), (u'Description' if french else u'Description', book_description), (u'Sommaire' if french else u'Table of Content', sheet_title_by_slug_by_heading), (u'Notes' if french else u'Notes', literal_unicode(u'\n'.join(book_notes))), (u'Données initiales' if french else u'Source Data', collections.OrderedDict(( (u'Producteur' if french else u'Producer', u'Institut des politiques publiques'), (u'Format', u'XLS'), (u'URL', u'http://www.ipp.eu/outils/baremes-ipp/' if french else u'http://www.ipp.eu/en/tools/ipp-tax-and-benefit-tables/'), ))), (u'Convertisseur' if french else u'Converter', collections.OrderedDict(( (u'URL', u'https://git.framasoft.org/french-tax-and-benefit-tables/ipp-tax-and-benefit-tables-converters'), # noqa ))), (u'Données générées' if french else u'Generated Data', collections.OrderedDict(( (u'Format', u'YAML'), (u'URL', u'https://git.framasoft.org/french-tax-and-benefit-tables/ipp-tax-and-benefit-tables-yaml-raw'), # noqa ))), (u'Licence' if french else u'License', u'Licence ouverte <http://www.etalab.gouv.fr/licence-ouverte-open-licence>' if french else u'Open Licence <http://www.etalab.gouv.fr/licence-ouverte-open-licence>'), )) yaml_file_path_encoded = os.path.join( book_yaml_dir_encoded, (strings.slugify(sheet_name, transform = strings.upper) + u'.yaml').encode( file_system_encoding), ) elif 
sheet_name.startswith(u'Abréviation'): log.warning(u' Ignoring sheet {} of book {}.'.format(sheet_name, book_name)) sheet_warning = u'Sheet ignored.' sheet_title = sheet_title_by_name.get(sheet_name, sheet_name) sheet_node = collections.OrderedDict(( (u'Titre' if french else u'Title', sheet_title), )) yaml_file_path_encoded = os.path.join( book_yaml_dir_encoded, (strings.slugify(sheet_name, transform = strings.upper) + u'.yaml').encode( file_system_encoding), ) else: descriptions_rows = [] labels_rows = [] notes_rows = [] state = 'taxipp_names' taxipp_names_row = None values_rows = [] for row_index in range(sheet.nrows): columns_count = len(sheet.row_values(row_index)) if state == 'taxipp_names': taxipp_names_row = [ (taxipp_name or u'').strip() for taxipp_name in ( transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index) for column_index in range(columns_count) ) ] state = 'labels' if all( not taxipp_name for taxipp_name in taxipp_names_row ): # The first row is empty => This sheet doesn't contain TaxIPP names. continue # When any TaxIPP name is in lowercase, assume that this row is really the TaxIPP names row. 
if any( taxipp_name and taxipp_name[0].islower() for taxipp_name in taxipp_names_row ): continue else: log.info(u' Sheet "{}" of XLS file "{}" has no row for TaxIPP names.'.format( sheet_name, filename)) # warning = u'Row not found' # if sheet_warning is None: # sheet_warning = collections.OrderedDict() # if isinstance(sheet_warning, dict): # sheet_warning[u'Noms TaxIPP'] = warning # else: # assert isinstance(sheet_warning, basestring), sheet_warning # sheet_warning = u'\n\n'.join( # fragment # for fragment in (sheet_warning, warning) # if fragment # ) taxipp_names_row = [] if state == 'labels': first_cell_value, error = conv.pipe(cell_to_row_first_cell, conv.not_none)( transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0), state = conv.default_state) if error is not None: # First cell of row is not a the first cell of a row of values => Assume it is a label. labels_rows.append([ u' '.join((label or u'').split()).strip() for label in ( transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index) for column_index in range(columns_count) ) ]) continue state = 'values' if state == 'values': first_cell_value, error = cell_to_row_first_cell( transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0), state = conv.default_state) if error is None: # First cell of row is a valid date or year. 
values_row = [ value.strip() if isinstance(value, basestring) else value # noqa F821 for value in ( transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, column_index, empty_white_value = u'nc') for column_index in range(columns_count) ) ] if isinstance(first_cell_value, datetime.date): assert first_cell_value.year < 2601, 'Invalid date {} in {} at row {}'.format( first_cell_value, sheet_name, row_index + 1) values_rows.append(values_row) continue if isinstance(first_cell_value, basestring) and aad_re.match(first_cell_value) is not None: # noqa F821 values_rows.append(values_row) continue if all(value in (None, u'', u'nc') for value in values_row): # If first cell is empty and all other cells in line are also empty, ignore this # line. continue # First cell has no date and other cells in row are not empty => Assume it is a note. state = 'notes' if state == 'notes': first_cell_value = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0) if isinstance(first_cell_value, basestring) and first_cell_value.strip().lower() == 'notes': # noqa F821 notes_rows.append([ (line or u'').rstrip() for line in ( transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index) for column_index in range(columns_count) ) ]) continue state = 'description' assert state == 'description' descriptions_rows.append([ (line or u'').strip() for line in ( transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index) for column_index in range(columns_count) ) ]) sheet_node = collections.OrderedDict() sheet_title = sheet_title_by_name.get(sheet_name) if sheet_title is not None: sheet_node[u'Titre'] = sheet_title sheet_node[u'Titre court'] = sheet_name labels = [] for labels_row in labels_rows: for column_index, label in enumerate(labels_row): if label is None: continue label = label.strip() if not label: continue while column_index >= len(labels): labels.append([]) column_labels = labels[column_index] if not column_labels 
or column_labels[-1] != label: column_labels.append(label) labels = [ (tuple( label_stripped for label_stripped in ( (label or u'').strip() for label in column_labels1 ) if label_stripped ) if column_labels1 else None) or (u'Colonne sans titre',) for index, column_labels1 in enumerate(labels, 1) ] assert labels taxipp_name_by_column_labels = collections.OrderedDict() for column_labels, taxipp_name in zip(labels, taxipp_names_row): if not taxipp_name: continue taxipp_name_by_column_label = taxipp_name_by_column_labels for column_label in column_labels[:-1]: taxipp_name_by_column_label = taxipp_name_by_column_label.setdefault(column_label, collections.OrderedDict()) taxipp_name_by_column_label[column_labels[-1]] = taxipp_name if taxipp_name_by_column_labels: sheet_node[u'Noms TaxIPP'] = taxipp_name_by_column_labels sheet_values = [] for value_row in values_rows: cell_by_column_labels = collections.OrderedDict() for column_labels, cell in zip(labels, value_row): if cell is None or cell == '': continue cell_by_column_label = cell_by_column_labels for column_label in column_labels[:-1]: cell_by_column_label = cell_by_column_label.setdefault(column_label, collections.OrderedDict()) # Merge (amount, unit) couples to a string to simplify YAML. 
if isinstance(cell, tuple): cell = transform_amount_tuple_to_str(cell) if isinstance(cell, basestring) and u'\n' in cell: # noqa F821 cell = literal_unicode(cell) cell_by_column_label[column_labels[-1]] = cell sheet_values.append(cell_by_column_labels) if sheet_values: sheet_node[u'Valeurs'] = sheet_values notes = u'\n'.join([ line.rstrip() for line in u'\n'.join([ u' | '.join( cell for cell in row if cell ).rstrip() for row in notes_rows ]).split(u'\n') ]).rstrip() if notes: sheet_node[u'Notes'] = literal_unicode(notes) description = u'\n'.join([ line.rstrip() for line in u'\n'.join([ u' | '.join( cell for cell in row if cell ).rstrip() for row in descriptions_rows ]).split(u'\n') ]).rstrip() if description: sheet_node[u'Description'] = literal_unicode(description) yaml_file_path_encoded = os.path.join( book_yaml_dir_encoded, (strings.slugify(sheet_name) + u'.yaml').encode(file_system_encoding), ) if sheet_error: sheet_node[u'ERRORS'] = literal_unicode(sheet_error) \ if isinstance(sheet_error, basestring) and u'\n' in sheet_error else sheet_error # noqa F821 if sheet_warning: sheet_node[u'WARNINGS'] = literal_unicode(sheet_warning) \ if isinstance(sheet_warning, basestring) and u'\n' in sheet_warning else sheet_warning # noqa F821 with open(yaml_file_path_encoded, 'w') as yaml_file: yaml.dump(sheet_node, yaml_file, allow_unicode = True, default_flow_style = False, indent = 2, width = 120) except: # noqa E722 message = u'An exception occurred when parsing sheet "{}" of XLS file "{}".'.format(sheet_name, filename) log.exception(u' {}'.format(message)) sheet_error = literal_unicode(u'\n\n'.join( fragment for fragment in ( unicode(sheet_error) if sheet_error is not None else None, # noqa F821 message, traceback.format_exc().decode('utf-8'), ) if fragment )) if sheet_error: error_by_sheet_name[sheet_name] = sheet_error if sheet_warning: warning_by_sheet_name[sheet_name] = sheet_warning if error_by_sheet_name: yaml_file_path_encoded = os.path.join( 
book_yaml_dir_encoded, u'ERRORS.yaml'.encode(file_system_encoding), ) with open(yaml_file_path_encoded, 'w') as yaml_file: yaml.dump(error_by_sheet_name, yaml_file, allow_unicode = True, default_flow_style = False, indent = 2, width = 120) error_by_book_name[book_name] = error_by_sheet_name if warning_by_sheet_name: yaml_file_path_encoded = os.path.join( book_yaml_dir_encoded, u'WARNINGS.yaml'.encode(file_system_encoding), ) with open(yaml_file_path_encoded, 'w') as yaml_file: yaml.dump(warning_by_sheet_name, yaml_file, allow_unicode = True, default_flow_style = False, indent = 2, width = 120) warning_by_book_name[book_name] = warning_by_sheet_name if error_by_book_name: yaml_file_path_encoded = os.path.join( yaml_raw_dir, u'ERRORS.yaml'.encode(file_system_encoding), ) with open(yaml_file_path_encoded, 'w') as yaml_file: yaml.dump(error_by_book_name, yaml_file, allow_unicode = True, default_flow_style = False, indent = 2, width = 120) if warning_by_book_name: yaml_file_path_encoded = os.path.join( yaml_raw_dir, u'WARNINGS.yaml'.encode(file_system_encoding), ) with open(yaml_file_path_encoded, 'w') as yaml_file: yaml.dump(warning_by_book_name, yaml_file, allow_unicode = True, default_flow_style = False, indent = 2, width = 120)
def transform_node_to_element(name, node):
    """Convert a node of the IPP parameters tree to an XML element.

    `node` is either a dict (a NODE, or a BAREME scale when its 'TYPE' key is
    u'BAREME') or a list of raw values (a CODE leaf).  Returns an etree
    element, or None when the node yields no children (empty elements are
    pruned so that the generated XML stays minimal).
    """
    if isinstance(node, dict):
        if node.get('TYPE') == u'BAREME':
            # A scale: one TRANCHE element per slice; slices are keyed by the
            # names found under 'SEUIL'.
            scale_element = etree.Element('BAREME', attrib = dict(
                code = strings.slugify(name, separator = u'_'),
                ))
            for slice_name in node.get('SEUIL', {}).keys():
                slice_element = etree.Element('TRANCHE', attrib = dict(
                    code = strings.slugify(slice_name, separator = u'_'),
                    ))
                # SEUIL, MONTANT, TAUX and ASSIETTE were four copy-pasted
                # blocks differing only by their tag; keep this exact order to
                # preserve the structure of the generated XML.
                for tag in ('SEUIL', 'MONTANT', 'TAUX', 'ASSIETTE'):
                    sub_element = etree.Element(tag)
                    values, values_format, values_type = prepare_xml_values(
                        name, node.get(tag, {}).get(slice_name, []))
                    for value in values:
                        value_element = transform_value_to_element(value)
                        if value_element is not None:
                            sub_element.append(value_element)
                    if len(sub_element) > 0:
                        slice_element.append(sub_element)
                if len(slice_element) > 0:
                    scale_element.append(slice_element)
            return scale_element if len(scale_element) > 0 else None
        # A plain NODE: recurse into children, keeping only non-empty ones.
        node_element = etree.Element('NODE', attrib = dict(
            code = strings.slugify(name, separator = u'_'),
            ))
        for key, value in node.iteritems():
            child_element = transform_node_to_element(key, value)
            if child_element is not None:
                node_element.append(child_element)
        return node_element if len(node_element) > 0 else None
    # A leaf: a list of raw values becomes a CODE element.
    assert isinstance(node, list), node
    values, values_format, values_type = prepare_xml_values(name, node)
    if not values:
        return None
    code_element = etree.Element('CODE', attrib = dict(
        code = strings.slugify(name, separator = u'_'),
        ))
    if values_format is not None:
        code_element.set('format', values_format)
    if values_type is not None:
        code_element.set('type', values_type)
    for value in values:
        value_element = transform_value_to_element(value)
        if value_element is not None:
            code_element.append(value_element)
    return code_element if len(code_element) > 0 else None
def transform(xls_dir, yaml_raw_dir):
    """Convert every IPP XLS workbook found in `xls_dir` into raw YAML files under `yaml_raw_dir`.

    One YAML file is written per sheet, inside a directory named after the
    workbook.  Three kinds of sheets are recognized: table-of-content sheets
    ("Sommaire"/"Outline"), abbreviation sheets ("Abréviation", ignored), and
    data sheets, which are parsed with a row state machine
    (taxipp_names -> labels -> values -> notes -> description).
    Per-sheet parsing errors and warnings are collected and dumped to
    ERRORS.yaml / WARNINGS.yaml at both book level and top level, instead of
    aborting the whole run.
    """
    file_system_encoding = sys.getfilesystemencoding()
    error_by_book_name = collections.OrderedDict()
    warning_by_book_name = collections.OrderedDict()
    for filename_encoded in sorted(os.listdir(xls_dir)):
        if not filename_encoded.endswith('.xls'):
            continue
        filename = filename_encoded.decode(file_system_encoding)
        log.info(u'Parsing file {}'.format(filename))
        book_name = os.path.splitext(filename)[0]
        xls_path_encoded = os.path.join(xls_dir, filename_encoded)
        # formatting_info=True keeps cell formatting and hyperlink data, both
        # used below (empty-white cells, table-of-content hyperlinks).
        book = xlrd.open_workbook(filename=xls_path_encoded, formatting_info=True)
        book_yaml_dir_encoded = os.path.join(
            yaml_raw_dir,
            strings.slugify(book_name).encode(file_system_encoding))
        if not os.path.exists(book_yaml_dir_encoded):
            os.makedirs(book_yaml_dir_encoded)
        error_by_sheet_name = collections.OrderedDict()
        # NOTE(review): sheet_english_title_by_name is filled below but never
        # read afterwards in this function.
        sheet_english_title_by_name = collections.OrderedDict()
        sheet_title_by_name = collections.OrderedDict()
        warning_by_sheet_name = collections.OrderedDict()
        for sheet_name in book.sheet_names():
            log.info(u' Parsing sheet {}.'.format(sheet_name))
            sheet = book.sheet_by_name(sheet_name)
            sheet_error = None
            sheet_warning = None
            try:
                # Extract coordinates of merged cells: every cell of a merged
                # range is mapped to the range's top-left cell, which is the
                # one actually holding the value.
                merged_cells_tree = {}
                for row_low, row_high, column_low, column_high in sheet.merged_cells:
                    for row_index in range(row_low, row_high):
                        cell_coordinates_by_merged_column_index = merged_cells_tree.setdefault(
                            row_index, {})
                        for column_index in range(column_low, column_high):
                            cell_coordinates_by_merged_column_index[
                                column_index] = (row_low, column_low)
                if sheet_name.startswith((u'Sommaire', u'Outline')):
                    french = sheet_name.startswith(u'Sommaire')
                    # Associate the titles of the sheets to their Excel names.
                    book_title = transform_xls_cell_to_str(
                        book, sheet, merged_cells_tree, 1, 1)
                    if not book_title:
                        book_title = transform_xls_cell_to_str(
                            book, sheet, merged_cells_tree, 2, 1)
                    book_title = book_title.strip()
                    assert book_title
                    book_description = transform_xls_cell_to_str(
                        book, sheet, merged_cells_tree, 4, 1)
                    if not book_description:
                        book_description = transform_xls_cell_to_str(
                            book, sheet, merged_cells_tree, 5, 1)
                    book_description = book_description.strip()
                    assert book_description
                    # Try successive columns until one yields hyperlinked sheet
                    # titles.
                    for column_index in range(1, 4):
                        # NOTE(review): both branches of this conditional yield
                        # u'Annexes'; the non-French default was probably meant
                        # to differ — confirm against the original workbooks.
                        current_heading = u'Annexes' if french else u'Annexes'
                        sheet_title_by_slug_by_heading = collections.OrderedDict()
                        for row_index in range(sheet.nrows):
                            heading = transform_xls_cell_to_json(
                                book, sheet, merged_cells_tree, row_index, 1)
                            if isinstance(heading, basestring):
                                heading = heading.strip()
                                if not heading:
                                    continue
                                if heading == book_title or heading == book_description:
                                    continue
                                if number_re.match(heading) is None:
                                    # A non-numbered text cell starts a new
                                    # heading group.
                                    current_heading = heading
                                    continue
                            linked_sheet_number = transform_xls_cell_to_json(
                                book, sheet, merged_cells_tree, row_index, column_index)
                            if isinstance(linked_sheet_number, int) or (
                                    isinstance(linked_sheet_number, basestring)
                                    and number_re.match(linked_sheet_number) is not None):
                                linked_sheet_title = transform_xls_cell_to_str(
                                    book, sheet, merged_cells_tree, row_index, column_index + 1)
                                if linked_sheet_title is not None:
                                    linked_sheet_title = linked_sheet_title.strip()
                                if linked_sheet_title:
                                    hyperlink = get_hyperlink(
                                        sheet, row_index, column_index + 1)
                                    if hyperlink is not None and hyperlink.type == u'workbook':
                                        # Internal hyperlink target looks like
                                        # "'Sheet name'!A1": keep only the name.
                                        linked_sheet_name = hyperlink.textmark.split(
                                            u'!', 1)[0].strip(u'"').strip(u"'")
                                        sheet_title_by_slug = sheet_title_by_slug_by_heading.setdefault(
                                            current_heading, collections.OrderedDict())
                                        sheet_title_by_slug[strings.slugify(
                                            linked_sheet_name)] = linked_sheet_title
                                        if french:
                                            sheet_title_by_name[
                                                linked_sheet_name] = linked_sheet_title
                                        else:
                                            sheet_english_title_by_name[
                                                linked_sheet_name] = linked_sheet_title
                        if sheet_title_by_slug_by_heading:
                            break
                    assert sheet_title_by_slug_by_heading
                    # Book-level notes are searched in columns 8..11, stopping
                    # a column after two consecutive blank note cells.
                    book_notes = []
                    for column_index in range(8, 12):
                        for row_index in range(sheet.nrows):
                            note = transform_xls_cell_to_str(
                                book, sheet, merged_cells_tree, row_index, column_index)
                            if note and note.strip() == book_description:
                                continue
                            if book_notes or note:
                                book_notes.append((note or u'').rstrip())
                                if note:
                                    blank_notes_count = 0
                                elif blank_notes_count >= 1:
                                    break
                                else:
                                    blank_notes_count += 1
                        if book_notes:
                            break
                    # Drop trailing blank lines.
                    while book_notes and not book_notes[-1]:
                        del book_notes[-1]
                    assert book_notes
                    sheet_node = collections.OrderedDict((
                        (u'Titre' if french else u'Title', book_title),
                        (u'Description' if french else u'Description', book_description),
                        (u'Sommaire' if french else u'Table of Content', sheet_title_by_slug_by_heading),
                        (u'Notes' if french else u'Notes', literal_unicode(u'\n'.join(book_notes))),
                        (u'Données initiales' if french else u'Source Data', collections.OrderedDict((
                            (u'Producteur' if french else u'Producer', u'Institut des politiques publiques'),
                            (u'Format', u'XLS'),
                            (u'URL', u'http://www.ipp.eu/outils/baremes-ipp/'
                                if french else u'http://www.ipp.eu/en/tools/ipp-tax-and-benefit-tables/'),
                            ))),
                        (u'Convertisseur' if french else u'Converter', collections.OrderedDict((
                            (u'URL', u'https://git.framasoft.org/french-tax-and-benefit-tables/ipp-tax-and-benefit-tables-converters'),  # noqa
                            ))),
                        (u'Données générées' if french else u'Generated Data', collections.OrderedDict((
                            (u'Format', u'YAML'),
                            (u'URL', u'https://git.framasoft.org/french-tax-and-benefit-tables/ipp-tax-and-benefit-tables-yaml-raw'),  # noqa
                            ))),
                        (u'Licence' if french else u'License',
                            u'Licence ouverte <http://www.etalab.gouv.fr/licence-ouverte-open-licence>'
                            if french else u'Open Licence <http://www.etalab.gouv.fr/licence-ouverte-open-licence>'),
                        ))
                    yaml_file_path_encoded = os.path.join(
                        book_yaml_dir_encoded,
                        (strings.slugify(sheet_name, transform=strings.upper) + u'.yaml').encode(file_system_encoding),
                        )
                elif sheet_name.startswith(u'Abréviation'):
                    # Abbreviation sheets are not converted, only recorded.
                    log.warning(u' Ignoring sheet {} of book {}.'.format(
                        sheet_name, book_name))
                    sheet_warning = u'Sheet ignored.'
                    sheet_title = sheet_title_by_name.get(sheet_name, sheet_name)
                    # NOTE(review): `french` is only bound while parsing a
                    # Sommaire/Outline sheet; this assumes such a sheet always
                    # precedes the abbreviation sheet — confirm.
                    sheet_node = collections.OrderedDict(
                        ((u'Titre' if french else u'Title', sheet_title), ))
                    yaml_file_path_encoded = os.path.join(
                        book_yaml_dir_encoded,
                        (strings.slugify(sheet_name, transform=strings.upper) + u'.yaml').encode(file_system_encoding),
                        )
                else:
                    # Data sheet: classify each row with a forward-only state
                    # machine (taxipp_names -> labels -> values -> notes ->
                    # description).
                    descriptions_rows = []
                    labels_rows = []
                    notes_rows = []
                    state = 'taxipp_names'
                    taxipp_names_row = None
                    values_rows = []
                    for row_index in range(sheet.nrows):
                        columns_count = len(sheet.row_values(row_index))
                        if state == 'taxipp_names':
                            taxipp_names_row = [
                                (taxipp_name or u'').strip()
                                for taxipp_name in (
                                    transform_xls_cell_to_str(
                                        book, sheet, merged_cells_tree, row_index, column_index)
                                    for column_index in range(columns_count))
                                ]
                            state = 'labels'
                            if all(not taxipp_name for taxipp_name in taxipp_names_row):
                                # The first row is empty => This sheet doesn't contain TaxIPP names.
                                continue
                            # When any TaxIPP name is in lowercase, assume that this row is really the TaxIPP names row.
                            if any(taxipp_name and taxipp_name[0].islower()
                                    for taxipp_name in taxipp_names_row):
                                continue
                            else:
                                log.info(
                                    u' Sheet "{}" of XLS file "{}" has no row for TaxIPP names.'.format(sheet_name, filename))
                                # warning = u'Row not found'
                                # if sheet_warning is None:
                                #     sheet_warning = collections.OrderedDict()
                                # if isinstance(sheet_warning, dict):
                                #     sheet_warning[u'Noms TaxIPP'] = warning
                                # else:
                                #     assert isinstance(sheet_warning, basestring), sheet_warning
                                #     sheet_warning = u'\n\n'.join(
                                #         fragment
                                #         for fragment in (sheet_warning, warning)
                                #         if fragment
                                #         )
                                taxipp_names_row = []
                        if state == 'labels':
                            first_cell_value, error = conv.pipe(
                                cell_to_row_first_cell, conv.not_none)(
                                    transform_xls_cell_to_json(
                                        book, sheet, merged_cells_tree, row_index, 0),
                                    state=conv.default_state)
                            if error is not None:
                                # First cell of row is not the first cell of a row of values => Assume it is a label.
                                labels_rows.append([
                                    u' '.join((label or u'').split()).strip()
                                    for label in (
                                        transform_xls_cell_to_str(
                                            book, sheet, merged_cells_tree, row_index, column_index)
                                        for column_index in range(columns_count))
                                    ])
                                continue
                            state = 'values'
                        if state == 'values':
                            first_cell_value, error = cell_to_row_first_cell(
                                transform_xls_cell_to_json(
                                    book, sheet, merged_cells_tree, row_index, 0),
                                state=conv.default_state)
                            if error is None:
                                # First cell of row is a valid date or year.
                                values_row = [
                                    value.strip() if isinstance(value, basestring) else value
                                    for value in (
                                        transform_xls_cell_to_json(
                                            book, sheet, merged_cells_tree, row_index, column_index,
                                            empty_white_value=u'nc')
                                        for column_index in range(columns_count))
                                    ]
                                if isinstance(first_cell_value, datetime.date):
                                    # Guard against typos like year 20014.
                                    assert first_cell_value.year < 2601, 'Invalid date {} in {} at row {}'.format(
                                        first_cell_value, sheet_name, row_index + 1)
                                    values_rows.append(values_row)
                                    continue
                                if isinstance(first_cell_value, basestring) \
                                        and aad_re.match(first_cell_value) is not None:
                                    values_rows.append(values_row)
                                    continue
                                if all(value in (None, u'', u'nc') for value in values_row):
                                    # If first cell is empty and all other cells in line are also empty, ignore this
                                    # line.
                                    continue
                            # First cell has no date and other cells in row are not empty => Assume it is a note.
                            state = 'notes'
                        if state == 'notes':
                            first_cell_value = transform_xls_cell_to_json(
                                book, sheet, merged_cells_tree, row_index, 0)
                            if isinstance(first_cell_value, basestring) \
                                    and first_cell_value.strip().lower() == 'notes':
                                notes_rows.append([
                                    (line or u'').rstrip()
                                    for line in (
                                        transform_xls_cell_to_str(
                                            book, sheet, merged_cells_tree, row_index, column_index)
                                        for column_index in range(columns_count))
                                    ])
                                continue
                            state = 'description'
                        assert state == 'description'
                        descriptions_rows.append([
                            (line or u'').strip()
                            for line in (
                                transform_xls_cell_to_str(
                                    book, sheet, merged_cells_tree, row_index, column_index)
                                for column_index in range(columns_count))
                            ])
                    sheet_node = collections.OrderedDict()
                    sheet_title = sheet_title_by_name.get(sheet_name)
                    if sheet_title is not None:
                        sheet_node[u'Titre'] = sheet_title
                    sheet_node[u'Titre court'] = sheet_name
                    # Consolidate label rows column by column, skipping
                    # consecutive duplicates (caused by merged cells).
                    labels = []
                    for labels_row in labels_rows:
                        for column_index, label in enumerate(labels_row):
                            if label is None:
                                continue
                            label = label.strip()
                            if not label:
                                continue
                            while column_index >= len(labels):
                                labels.append([])
                            column_labels = labels[column_index]
                            if not column_labels or column_labels[-1] != label:
                                column_labels.append(label)
                    # Each column becomes a tuple of labels; untitled columns
                    # get a placeholder name.
                    labels = [
                        (tuple(
                            label_stripped
                            for label_stripped in (
                                (label or u'').strip()
                                for label in column_labels1)
                            if label_stripped) if column_labels1 else None) or (u'Colonne sans titre', )
                        for index, column_labels1 in enumerate(labels, 1)
                        ]
                    assert labels
                    # Nest TaxIPP names under their column label hierarchy.
                    taxipp_name_by_column_labels = collections.OrderedDict()
                    for column_labels, taxipp_name in zip(labels, taxipp_names_row):
                        if not taxipp_name:
                            continue
                        taxipp_name_by_column_label = taxipp_name_by_column_labels
                        for column_label in column_labels[:-1]:
                            taxipp_name_by_column_label = taxipp_name_by_column_label.setdefault(
                                column_label, collections.OrderedDict())
                        taxipp_name_by_column_label[column_labels[-1]] = taxipp_name
                    if taxipp_name_by_column_labels:
                        sheet_node[u'Noms TaxIPP'] = taxipp_name_by_column_labels
                    # Nest each value row's cells under the same hierarchy.
                    sheet_values = []
                    for value_row in values_rows:
                        cell_by_column_labels = collections.OrderedDict()
                        for column_labels, cell in zip(labels, value_row):
                            if cell is None or cell == '':
                                continue
                            cell_by_column_label = cell_by_column_labels
                            for column_label in column_labels[:-1]:
                                cell_by_column_label = cell_by_column_label.setdefault(
                                    column_label, collections.OrderedDict())
                            # Merge (amount, unit) couples to a string to simplify YAML.
                            if isinstance(cell, tuple):
                                cell = transform_amount_tuple_to_str(cell)
                            if isinstance(cell, basestring) and u'\n' in cell:
                                cell = literal_unicode(cell)
                            cell_by_column_label[column_labels[-1]] = cell
                        sheet_values.append(cell_by_column_labels)
                    if sheet_values:
                        sheet_node[u'Valeurs'] = sheet_values
                    # Join multi-column note/description rows with " | ".
                    notes = u'\n'.join([
                        line.rstrip()
                        for line in u'\n'.join([
                            u' | '.join(cell for cell in row if cell).rstrip()
                            for row in notes_rows
                            ]).split(u'\n')
                        ]).rstrip()
                    if notes:
                        sheet_node[u'Notes'] = literal_unicode(notes)
                    description = u'\n'.join([
                        line.rstrip()
                        for line in u'\n'.join([
                            u' | '.join(cell for cell in row if cell).rstrip()
                            for row in descriptions_rows
                            ]).split(u'\n')
                        ]).rstrip()
                    if description:
                        sheet_node[u'Description'] = literal_unicode(description)
                    yaml_file_path_encoded = os.path.join(
                        book_yaml_dir_encoded,
                        (strings.slugify(sheet_name) + u'.yaml').encode(file_system_encoding),
                        )
                # Attach accumulated diagnostics to the sheet before dumping;
                # multi-line strings are dumped as YAML literal blocks.
                if sheet_error:
                    sheet_node[u'ERRORS'] = literal_unicode(sheet_error) \
                        if isinstance(sheet_error, basestring) and u'\n' in sheet_error \
                        else sheet_error
                if sheet_warning:
                    sheet_node[u'WARNINGS'] = literal_unicode(sheet_warning) \
                        if isinstance(sheet_warning, basestring) and u'\n' in sheet_warning \
                        else sheet_warning
                with open(yaml_file_path_encoded, 'w') as yaml_file:
                    yaml.dump(sheet_node, yaml_file, allow_unicode=True,
                        default_flow_style=False, indent=2, width=120)
            except:
                # Deliberate catch-all: a broken sheet must not abort the whole
                # conversion; the traceback is recorded in the error report.
                message = u'An exception occurred when parsing sheet "{}" of XLS file "{}".'.format(
                    sheet_name, filename)
                log.exception(u' {}'.format(message))
                sheet_error = literal_unicode(u'\n\n'.join(
                    fragment
                    for fragment in (
                        unicode(sheet_error) if sheet_error is not None else None,
                        message,
                        traceback.format_exc().decode('utf-8'),
                        )
                    if fragment))
            if sheet_error:
                error_by_sheet_name[sheet_name] = sheet_error
            if sheet_warning:
                warning_by_sheet_name[sheet_name] = sheet_warning
        # Per-book diagnostics files.
        if error_by_sheet_name:
            yaml_file_path_encoded = os.path.join(
                book_yaml_dir_encoded,
                u'ERRORS.yaml'.encode(file_system_encoding),
                )
            with open(yaml_file_path_encoded, 'w') as yaml_file:
                yaml.dump(error_by_sheet_name, yaml_file, allow_unicode=True,
                    default_flow_style=False, indent=2, width=120)
            error_by_book_name[book_name] = error_by_sheet_name
        if warning_by_sheet_name:
            yaml_file_path_encoded = os.path.join(
                book_yaml_dir_encoded,
                u'WARNINGS.yaml'.encode(file_system_encoding),
                )
            with open(yaml_file_path_encoded, 'w') as yaml_file:
                yaml.dump(warning_by_sheet_name, yaml_file, allow_unicode=True,
                    default_flow_style=False, indent=2, width=120)
            warning_by_book_name[book_name] = warning_by_sheet_name
    # Global diagnostics files.
    if error_by_book_name:
        yaml_file_path_encoded = os.path.join(
            yaml_raw_dir,
            u'ERRORS.yaml'.encode(file_system_encoding),
            )
        with open(yaml_file_path_encoded, 'w') as yaml_file:
            yaml.dump(error_by_book_name, yaml_file, allow_unicode=True,
                default_flow_style=False, indent=2, width=120)
    if warning_by_book_name:
        yaml_file_path_encoded = os.path.join(
            yaml_raw_dir,
            u'WARNINGS.yaml'.encode(file_system_encoding),
            )
        with open(yaml_file_path_encoded, 'w') as yaml_file:
            yaml.dump(warning_by_book_name, yaml_file, allow_unicode=True,
                default_flow_style=False, indent=2, width=120)
def login(req):
    """Authorization request: verify a Mozilla Persona assertion and open a session.

    Expects a POST `assertion` field, verifies it against the Persona verifier
    service, then attaches a (new or registered) account to the session, sets
    the session cookie and returns a 204 No Content response.
    """
    ctx = contexts.Ctx(req)
    params = req.POST
    inputs = dict(
        assertion = params.get('assertion'),
        )
    data, errors = conv.struct(
        dict(
            assertion = conv.pipe(
                conv.cleanup_line,
                conv.not_none,
                ),
            ),
        )(inputs, state = ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation = ctx._(u'Login Error: {0}').format(errors))
    # Delegate assertion verification to the remote Persona verifier.
    response = requests.post('https://verifier.login.persona.org/verify',
        data = dict(
            audience = urls.get_full_url(ctx),
            assertion = data['assertion'],
            ),
        verify = True,
        )
    if not response.ok:
        return wsgihelpers.internal_error(ctx,
            dump = response.text,
            explanation = ctx._(u'Error while verifying authentication assertion'),
            )
    verification_data = json.loads(response.content)
    # Check if the assertion was valid.
    if verification_data['status'] != 'okay':
        return wsgihelpers.internal_error(ctx,
            dump = response.text,
            explanation = ctx._(u'Error while verifying authentication assertion'),
            )
    registered_account = model.Account.find_one(
        dict(
            email = verification_data['email'],
            ),
        as_class = collections.OrderedDict,
        )
    session = ctx.session
    if session is None:
        # No existing session: open one, valid for 4 hours.
        ctx.session = session = model.Session()
        session.expiration = datetime.datetime.utcnow() + datetime.timedelta(hours = 4)
    if registered_account is None:
        # Unknown email: create and save a fresh account for it.
        user = session.user
        if user is None:
            user = model.Account()
            user.api_key = uuidhelpers.generate_uuid()
            user.email = verification_data['email']
            user.full_name = verification_data['email']
            user.slug = strings.slugify(user.full_name)
            user.compute_words()
            user.save(safe = True)
        session.user = user
    else:
        session.user = registered_account
    # Rotate both tokens on every successful login.
    session.anonymous_token = uuidhelpers.generate_uuid()
    session.token = uuidhelpers.generate_uuid()
    session.save(safe = True)
    req.response.set_cookie(conf['cookie'], session.token, httponly = True, secure = req.scheme == 'https')
    return wsgihelpers.no_content(ctx)
def login(req):
    """Authorization request.

    Validates the POSTed Persona `assertion`, asks the verifier service to
    check it, binds the verified email to an account (creating one on first
    login), stores rotated tokens in the session and sets the session cookie.
    Responds 204 No Content on success.
    """
    ctx = contexts.Ctx(req)
    params = req.POST
    inputs = dict(assertion=params.get('assertion'), )
    data, errors = conv.struct(
        dict(assertion=conv.pipe(
            conv.cleanup_line,
            conv.not_none,
            ), ),
        )(inputs, state=ctx)
    if errors is not None:
        return wsgihelpers.bad_request(
            ctx, explanation=ctx._(u'Login Error: {0}').format(errors))
    # Ask the Persona verifier service to validate the assertion.
    response = requests.post(
        'https://verifier.login.persona.org/verify',
        data=dict(
            audience=urls.get_full_url(ctx),
            assertion=data['assertion'],
            ),
        verify=True,
        )
    if not response.ok:
        return wsgihelpers.internal_error(
            ctx,
            dump=response.text,
            explanation=ctx._(
                u'Error while verifying authentication assertion'),
            )
    verification_data = json.loads(response.content)
    # Check if the assertion was valid.
    if verification_data['status'] != 'okay':
        return wsgihelpers.internal_error(
            ctx,
            dump=response.text,
            explanation=ctx._(
                u'Error while verifying authentication assertion'),
            )
    registered_account = model.Account.find_one(
        dict(email=verification_data['email'], ),
        as_class=collections.OrderedDict,
        )
    session = ctx.session
    if session is None:
        # Open a new 4-hour session when none exists.
        ctx.session = session = model.Session()
        session.expiration = datetime.datetime.utcnow() + datetime.timedelta(
            hours=4)
    if registered_account is None:
        # First login with this email: build and persist a new account.
        user = session.user
        if user is None:
            user = model.Account()
            user.api_key = uuidhelpers.generate_uuid()
            user.email = verification_data['email']
            user.full_name = verification_data['email']
            user.slug = strings.slugify(user.full_name)
            user.compute_words()
            user.save(safe=True)
        session.user = user
    else:
        session.user = registered_account
    # Fresh tokens are generated on each login.
    session.anonymous_token = uuidhelpers.generate_uuid()
    session.token = uuidhelpers.generate_uuid()
    session.save(safe=True)
    req.response.set_cookie(conf['cookie'], session.token, httponly=True,
        secure=req.scheme == 'https')
    return wsgihelpers.no_content(ctx)
'Compensated Own-Price and Cross-Price Elasticities -- Modest and aged more than 60' ), ] cross_price_elasticities = pandas.DataFrame() for table in tables: age = table.pop('age') name = table.pop('name') revenus = table.pop('revenus') df = pandas.read_excel(elasticities_origin_xlsx, **table) df.dropna(inplace=True) df.set_index('Unnamed: 0', inplace=True) df.index.name = 'product' if age is None or revenus is None: df.name = name csv_path_name = os.path.join(elasticities_path, slugify(name) + '.csv') df.to_csv(csv_path_name) else: df['age'] = age df['revenus'] = revenus cross_price_elasticities = cross_price_elasticities.append(df) csv_path_name = os.path.join( elasticities_path, 'cross_price_elasticities.csv', ) cross_price_elasticities.to_csv(csv_path_name)
def main():
    """Merge clean IPP YAML parameter files with the original OpenFisca XML parameters.

    Reads the original `param.xml`, applies path renames/removals from the
    param-translations YAML file, builds a parameter tree from the IPP YAML
    files, converts it to XML and merges both, finally writing one XML file per
    top-level node plus a `__root__.xml` into the target directory.
    Returns 0 on success (process exit code).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--ipp-translations',
        default = os.path.join(param_dir, 'ipp-tax-and-benefit-tables-to-parameters.yaml'),
        help = 'path of YAML file containing the association between IPP fields and OpenFisca parameters')
    parser.add_argument('-o', '--origin', default = os.path.join(param_dir, 'param.xml'),
        help = 'path of XML file containing the original OpenFisca parameters')
    parser.add_argument('-p', '--param-translations',
        default = os.path.join(param_dir, 'param-to-parameters.yaml'),
        help = 'path of YAML file containing the association between param elements and OpenFisca parameters')
    parser.add_argument('-s', '--source-dir', default = 'yaml-clean',
        help = 'path of source directory containing clean IPP YAML files')
    parser.add_argument('-t', '--target', default = os.path.join(package_dir, 'parameters'),
        help = 'path of generated directory of XML files merging IPP fields with OpenFisca parameters')
    parser.add_argument('-v', '--verbose', action = 'store_true', default = False,
        help = "increase output verbosity")
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)
    assert os.path.isdir(args.source_dir), args.source_dir
    file_system_encoding = sys.getfilesystemencoding()
    original_element_tree = etree.parse(args.origin)
    original_root_element = original_element_tree.getroot()
    # Apply translations to original parameters: each entry maps a dotted old
    # path to a dotted new path (or None to delete the element).
    # NOTE(review): yaml.load without an explicit Loader executes arbitrary
    # tags; acceptable only because the translations file is trusted input.
    with open(args.param_translations) as param_translations_file:
        param_translations = yaml.load(param_translations_file)
    for old_path, new_path in param_translations.iteritems():
        # Locate the element at old_path, remembering its parent.
        parent_element = None
        element = original_root_element
        for name in old_path.split('.'):
            for child in element:
                if child.get('code') == name:
                    parent_element = element
                    element = child
                    break
            else:
                assert False, 'Path "{}" not found in "{}"'.format(old_path, args.origin)
        parent_element.remove(element)
        if new_path is not None:
            # Re-attach the element under new_path, creating missing NODEs.
            parent_element = original_root_element
            split_new_path = new_path.split('.')
            for name in split_new_path[:-1]:
                for child in parent_element:
                    if child.get('code') == name:
                        parent_element = child
                        break
                else:
                    parent_element = etree.SubElement(parent_element, 'NODE', attrib = dict(
                        code = name,
                        ))
            name = split_new_path[-1]
            assert all(
                child.get('code') != name
                for child in parent_element
                ), 'Path "{}" already exists in "{}"'.format(new_path, args.origin)
            element.set('code', name)
            parent_element.append(element)
    # Build `tree` from IPP YAML files.
    tree = collections.OrderedDict()
    for source_dir_encoded, directories_name_encoded, filenames_encoded in os.walk(args.source_dir):
        # Sort in place so os.walk visits sub-directories deterministically.
        directories_name_encoded.sort()
        for filename_encoded in sorted(filenames_encoded):
            if not filename_encoded.endswith('.yaml'):
                continue
            filename = filename_encoded.decode(file_system_encoding)
            sheet_name = os.path.splitext(filename)[0]
            source_file_path_encoded = os.path.join(source_dir_encoded, filename_encoded)
            relative_file_path_encoded = source_file_path_encoded[len(args.source_dir):].lstrip(os.sep)
            relative_file_path = relative_file_path_encoded.decode(file_system_encoding)
            # Uppercase file names (ERRORS, WARNINGS, summaries) are skipped.
            if sheet_name.isupper():
                continue
            assert sheet_name.islower(), sheet_name
            log.info(u'Loading file {}'.format(relative_file_path))
            with open(source_file_path_encoded) as source_file:
                data = yaml.load(source_file)
            rows = data.get(u"Valeurs")
            if rows is None:
                log.info(u' Skipping file {} without "Valeurs"'.format(relative_file_path))
                continue
            # Index each row by its start date, trying several date keys.
            row_by_start = {}
            for row in rows:
                start = row.get(u"Date d'effet")
                if start is None:
                    for date_name in date_names:
                        start = row.get(date_name)
                        if start is not None:
                            break
                    else:
                        # No date found. Skip row.
                        continue
                elif not isinstance(start, datetime.date):
                    start = start[u"Année Revenus"]
                row_by_start[start] = row
            sorted_row_by_start = sorted(row_by_start.iteritems())
            # Collect every (nested) IPP field path present in each dated row.
            relative_ipp_paths_by_start = {}
            unsorted_relative_ipp_paths = set()
            for start, row in sorted_row_by_start:
                relative_ipp_paths_by_start[start] = start_relative_ipp_paths = []
                for name, child in row.iteritems():
                    if name in date_names:
                        continue
                    if name in note_names:
                        continue
                    if name in reference_names:
                        continue
                    start_relative_ipp_paths.extend(
                        (name,) + tuple(path)
                        for path, value in iter_ipp_values(child)
                        )
                unsorted_relative_ipp_paths.update(start_relative_ipp_paths)

            def compare_relative_ipp_paths(x, y):
                # Order two paths by their first co-occurrence in any dated
                # row; incomparable pairs default to -1 (arbitrary but stable
                # within one sort).
                if x == y:
                    return 0
                for relative_ipp_paths in relative_ipp_paths_by_start.itervalues():
                    try:
                        return cmp(relative_ipp_paths.index(x), relative_ipp_paths.index(y))
                    except ValueError:
                        # Either x or y paths are missing in relative_ipp_paths => Their order can't be compared.
                        continue
                return -1

            sorted_relative_ipp_paths = sorted(unsorted_relative_ipp_paths, cmp = compare_relative_ipp_paths)
            # tax_rate_tree_by_bracket_type = {}
            # Fold each dated value into `tree`, closing the previous leaf
            # when the value changes.
            for start, row in sorted_row_by_start:
                for relative_ipp_path in sorted_relative_ipp_paths:
                    value = row
                    for fragment in relative_ipp_path:
                        value = value.get(fragment)
                        if value is None:
                            break
                    if value in (u'-', u'na', u'nc'):
                        # Value is unknown. Previous value must be propagated.
                        continue
                    # NOTE(review): a missing path leaves `value` as None here
                    # and it still flows into the leaf below — confirm this is
                    # intended rather than a skipped `continue`.
                    ipp_path = [
                        fragment if fragment in ('RENAME', 'TRANCHE', 'TYPE')
                        else strings.slugify(fragment, separator = u'_')
                        for fragment in itertools.chain(
                            relative_file_path.split(os.sep)[:-1],
                            [sheet_name],
                            relative_ipp_path,
                            )
                        ]
                    sub_tree = tree
                    for fragment in ipp_path[:-1]:
                        sub_tree = sub_tree.setdefault(fragment, collections.OrderedDict())
                    fragment = ipp_path[-1]
                    sub_tree = sub_tree.setdefault(fragment, [])
                    if sub_tree:
                        last_leaf = sub_tree[-1]
                        if last_leaf['value'] == value:
                            continue
                        # Close previous leaf the day before the new value starts.
                        last_leaf['stop'] = start - datetime.timedelta(days = 1)
                    sub_tree.append(dict(
                        start = start,
                        value = value,
                        ))
    ipp_tax_and_benefit_tables_to_parameters.transform_ipp_tree(tree)
    root_element = transform_node_to_element(u'root', tree)
    add_origin_openfisca_attrib(original_root_element)
    merge_elements(root_element, original_root_element)
    # Since now `original_root_element` is discarded.
    # Replace the target directory content: one XML file per top-level node.
    if os.path.exists(args.target):
        for xml_file_path in glob.glob(os.path.join(args.target, '*.xml')):
            os.remove(xml_file_path)
    else:
        os.mkdir(args.target)
    for child_element in root_element[:]:
        root_element.remove(child_element)
        element_tree = etree.ElementTree(child_element)
        sort_elements(child_element)
        reindent(child_element)
        element_tree.write(os.path.join(args.target, '{}.xml'.format(child_element.attrib['code'])),
            encoding = 'utf-8')
    # The (now empty) root is kept as an index file.
    element_tree = etree.ElementTree(root_element)
    reindent(root_element)
    element_tree.write(os.path.join(args.target, '__root__.xml'), encoding = 'utf-8')
    return 0
def iter_ids(cls, ctx, categories_slug = None, competence_territories_id = None, competence_type = None,
        presence_territory = None, term = None):
    """Return the ids of the POIs matching every given criterion.

    Each criterion contributes one set of candidate ids to ``intersected_sets``; the final result is the
    intersection of all of them.  Returns ``cls.indexed_ids`` (all indexed POIs) when no criterion
    restricted the search, and an empty set as soon as one mandatory criterion matches nothing.
    """
    intersected_sets = []
    if competence_territories_id is not None:
        # POIs competent for at least one of the given territories (and/or competent "by nature",
        # i.e. indexed under the None territory key).
        competence_territories_sets = []
        if competence_type in (None, 'by_territory'):
            competence_territories_sets.extend(
                cls.ids_by_competence_territory_id.get(competence_territory_id)
                for competence_territory_id in competence_territories_id
                )
        if competence_type in (None, 'by_nature'):
            competence_territories_sets.append(cls.ids_by_competence_territory_id.get(None))
        territory_competent_pois_id = ramdb.union_set(competence_territories_sets)
        if not territory_competent_pois_id:
            # No POI is competent for any of the given territories => empty result.
            return set()
        intersected_sets.append(territory_competent_pois_id)
    if presence_territory is not None:
        territory_present_pois_id = cls.ids_by_presence_territory_id.get(presence_territory._id)
        if not territory_present_pois_id:
            return set()
        intersected_sets.append(territory_present_pois_id)
    if ctx.base_categories_slug is not None:
        # Base categories act as a single OR-criterion: a POI matches when it belongs to any of them.
        base_categories_sets = []
        base_categories_slug = copy(ctx.base_categories_slug or [])
        for category_slug in set(base_categories_slug or []):
            if category_slug is not None:
                category_pois_id = cls.ids_by_category_slug.get(category_slug)
                if category_pois_id:
                    base_categories_sets.append(category_pois_id)
        intersected_sets.append(ramdb.union_set(base_categories_sets))
    # Explicitly requested categories are AND-criteria: every one must match.
    for category_slug in set(categories_slug or []):
        if category_slug is not None:
            category_pois_id = cls.ids_by_category_slug.get(category_slug)
            if not category_pois_id:
                return set()
            intersected_sets.append(category_pois_id)
    if conf['index.date.field']:
        # Keep only POIs whose date range contains the current datetime.
        # ids_by_begin_datetime / ids_by_end_datetime are kept sorted, so scanning can stop at the
        # first entry that falls outside the bound.
        current_datetime = datetime.datetime.utcnow()
        ids_by_begin_datetime_set = set()
        for poi_begin_datetime, poi_id in cls.ids_by_begin_datetime:
            if poi_begin_datetime is None or current_datetime >= poi_begin_datetime:
                ids_by_begin_datetime_set.add(poi_id)
            else:
                break
        ids_by_end_datetime_set = set()
        for poi_end_datetime, poi_id in cls.ids_by_end_datetime:
            if poi_end_datetime is None or current_datetime <= poi_end_datetime:
                ids_by_end_datetime_set.add(poi_id)
            else:
                break
        intersected_sets.append(ramdb.intersection_set([ids_by_begin_datetime_set, ids_by_end_datetime_set]))
    # We should filter on term *after* having looked for competent organizations. Otherwise, when no organization
    # matching term is found, the nearest organizations will be used even when there are competent organizations
    # (that don't match the term).
    if term:
        # Every word-prefix of the search term must match at least one indexed word.
        prefixes = strings.slugify(term).split(u'-')
        pois_id_by_prefix = {}
        for prefix in prefixes:
            if prefix in pois_id_by_prefix:
                # TODO? Handle pois with several words sharing the same prefix?
                continue
            pois_id_by_prefix[prefix] = ramdb.union_set(
                pois_id
                for word, pois_id in cls.ids_by_word.iteritems()
                if word.startswith(prefix)
                ) or set()
        intersected_sets.extend(pois_id_by_prefix.itervalues())
    found_pois_id = ramdb.intersection_set(intersected_sets)
    if found_pois_id is None:
        # No criterion restricted the search => return every indexed POI id.
        return cls.indexed_ids
    return found_pois_id
def is_multimodal_info_service(self):
    """Return the boolean value of the "service d'information multimodale" field, or None when absent."""
    wanted_slug = 'service-d-information-multimodale'
    for candidate_field in self.fields:
        if candidate_field.id != 'boolean':
            continue
        if strings.slugify(candidate_field.label) != wanted_slug:
            continue
        return conv.check(conv.guess_bool(candidate_field.value))
    return None
def load(cls, poi_bson):
    """Build a POI instance from its MongoDB BSON document and register it in the class-level indexes.

    Returns the new instance.  The raw BSON is kept on the instance (``self.bson``) because it is
    needed later by ``index_pois``.
    """
    metadata = poi_bson['metadata']
    last_update = metadata['last-update']
    if poi_bson.get('geo') is None:
        geo = None
    else:
        geo = poi_bson['geo'][0]
        if len(geo) > 2 and geo[2] == 0:
            # Don't use geographical coordinates with a 0 accuracy because their coordinates may be None.
            geo = None
    self = cls(
        _id = poi_bson['_id'],
        geo = geo,
        last_update_datetime = last_update['date'],
        last_update_organization = last_update['organization'],
        name = metadata['title'],
        schema_name = metadata['schema-name'],
        )
    # The "theme" field is designated by conf; its optional name further restricts the match by label.
    if conf['theme_field'] is None:
        theme_field_id = None
        theme_field_name = None
    else:
        theme_field_id = conf['theme_field']['id']
        theme_field_name = conf['theme_field'].get('name')
    # metadata['positions'] lists field ids in display order; a field id may occur several times, so
    # fields_position tracks the next occurrence index to read for each id.
    fields_position = {}
    fields = []
    for field_id in metadata['positions']:
        field_position = fields_position.get(field_id, 0)
        fields_position[field_id] = field_position + 1
        field_metadata = metadata[field_id][field_position]
        field_value = poi_bson[field_id][field_position]
        field = Field.load(field_id, field_metadata, field_value)
        if field.id == u'adr' and self.postal_distribution_str is None:
            # First address field encountered provides the postal distribution and street address.
            for sub_field in (field.value or []):
                if sub_field.id == u'postal-distribution':
                    self.postal_distribution_str = sub_field.value
                elif sub_field.id == u'street-address':
                    self.street_address = sub_field.value
        elif field.id == u'link' and field.relation == u'parent':
            # NOTE(review): this asserts on `self.parent` but only ever assigns `self.parent_id`;
            # presumably the assert was meant to guard against two parent links — confirm.
            assert self.parent is None, str(self)
            self.parent_id = field.value
        if field_id == theme_field_id and (
                theme_field_name is None or theme_field_name == strings.slugify(field.label)):
            if field.id == u'organism-type':
                # Organism-type themes are resolved through the pivot-code -> category-slug mapping.
                organism_type_slug = ramdb.category_slug_by_pivot_code.get(field.value)
                if organism_type_slug is None:
                    log.warning('Ignoring organism type "{0}" without matching category.'.format(field.value))
                else:
                    self.theme_slug = organism_type_slug
            else:
                theme_slug = strings.slugify(field.value)
                if theme_slug in ramdb.category_by_slug:
                    self.theme_slug = theme_slug
                else:
                    log.warning('Ignoring theme "{0}" without matching category.'.format(field.value))
        fields.append(field)
    if fields:
        self.fields = fields
    # Temporarily store bson in poi because it is needed by index_pois.
    self.bson = poi_bson
    cls.instance_by_id[self._id] = self
    if self.parent_id is not None:
        cls.ids_by_parent_id.setdefault(self.parent_id, set()).add(self._id)
    return self
def __init__(self, **attributes):
    """Initialize the test case; default the title to the current UTC datetime and derive its slug."""
    super(TestCase, self).__init__(**attributes)
    title = self.title
    if title is None:
        title = babel.dates.format_datetime(datetime.datetime.utcnow())
        self.title = title
    self.slug = strings.slugify(title)
def main(path, date, option = 'all_months', month = 1):
    """Parse the IPP "Baremes" XLS workbooks found under ``path + date`` and dump one aggregated
    CSV per workbook, indexed by month.

    ``option`` selects the post-processing: 'all_months' (default, monthly forward-filled values),
    'mean_by_year' (yearly means) or 'which_month_in_year' (keep one given ``month`` per year).
    Returns 0 on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', default = path + date, help = 'path of IPP XLS directory')
    parser.add_argument('-v', '--verbose', action = 'store_true', default = False, help = "increase output verbosity")
    args = parser.parse_args()
    # args.dir = path
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)
    # Sheets that must not be parsed, per workbook.
    forbiden_sheets = {
        u'Impot Revenu': (u'Barème IGR',),
        u'prelevements sociaux': (u'Abréviations', u'ASSIETTE PU', u'AUBRYI', u'AUBRYII'),
        u'Taxation indirecte': (u'TVA par produit',),
        }
    baremes = [u'Prestations', u'Chomage', u'Impot Revenu', u'prelevements sociaux', u'Taxation indirecte',
        u'Taxation du capital', u'Taxes locales', u'Marche du travail']
    for bareme in baremes:
        log.info(u'Parsing file {}'.format(bareme))
        xls_path = os.path.join(args.dir.decode('utf-8'), u"Baremes IPP - {0}.xls".format(bareme))
        # xls_path = os.path.join(path, u"Baremes IPP - {0}.xls".format(bareme))
        book = xlrd.open_workbook(filename = xls_path, formatting_info = True)
        sheet_names = [
            sheet_name
            for sheet_name in book.sheet_names()
            if not sheet_name.startswith((u'Sommaire', u'Outline'))
            and not sheet_name in forbiden_sheets.get(bareme, [])
            ]
        vector_by_taxipp_name = {}
        for sheet_name in sheet_names:
            log.info(u' Parsing sheet {}'.format(sheet_name))
            sheet = book.sheet_by_name(sheet_name)
            # Extract coordinates of merged cells.
            # Maps row_index -> {column_index: (row, column) of the top-left cell of the merged range},
            # so merged cells can be resolved to their real value.
            merged_cells_tree = {}
            for row_low, row_high, column_low, column_high in sheet.merged_cells:
                for row_index in range(row_low, row_high):
                    cell_coordinates_by_merged_column_index = merged_cells_tree.setdefault(
                        row_index, {})
                    for column_index in range(column_low, column_high):
                        cell_coordinates_by_merged_column_index[column_index] = (row_low, column_low)
            # Rows are classified with a simple state machine: the first row holds TAXIPP variable
            # names, then label rows, then dated value rows, then notes, then free-form description.
            descriptions_rows = []
            labels_rows = []
            notes_rows = []
            state = 'taxipp_names'
            taxipp_names_row = None
            values_rows = []
            for row_index in range(sheet.nrows):
                ncols = len(sheet.row_values(row_index))
                if state == 'taxipp_names':
                    taxipp_names_row = [
                        taxipp_name
                        for taxipp_name in (
                            transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(ncols)
                            )
                        ]
                    state = 'labels'
                    continue
                if state == 'labels':
                    first_cell_value = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0)
                    date_or_year, error = conv.pipe(
                        conv.test_isinstance((int, basestring)),
                        cell_to_date_or_year,
                        conv.not_none,
                        )(first_cell_value, state = conv.default_state)
                    if error is not None:
                        # First cell of row is not a date => Assume it is a label.
                        labels_rows.append([
                            transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(ncols)
                            ])
                        continue
                    state = 'values'
                if state == 'values':
                    first_cell_value = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0)
                    if first_cell_value is None or isinstance(first_cell_value, (int, basestring)):
                        date_or_year, error = cell_to_date_or_year(first_cell_value, state = conv.default_state)
                        if error is None:
                            # First cell of row is a valid date or year.
                            values_row = [
                                transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, column_index)
                                for column_index in range(ncols)
                                ]
                            if date_or_year is not None:
                                assert date_or_year.year < 2601, 'Invalid date {} in {} at row {}'.format(
                                    date_or_year, sheet_name, row_index + 1)
                                values_rows.append(values_row)
                                continue
                            if all(value in (None, u'') for value in values_row):
                                # If first cell is empty and all other cells in line are also empty,
                                # ignore this line.
                                continue
                    # First cell has no date and other cells in row are not empty => Assume it is a note.
                    state = 'notes'
                if state == 'notes':
                    first_cell_value = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0)
                    if isinstance(first_cell_value, basestring) and first_cell_value.strip().lower() == 'notes':
                        notes_rows.append([
                            transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(ncols)
                            ])
                        continue
                    state = 'description'
                assert state == 'description'
                descriptions_rows.append([
                    transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                    for column_index in range(ncols)
                    ])
            # "Impot Revenu" rows carry the relevant date in the second column, other workbooks in
            # the first.  Dates are normalized to the first day of their month.
            dates = [
                conv.check(cell_to_date_or_year)(
                    row[1] if bareme == u'Impot Revenu' else row[0],
                    state = conv.default_state,
                    ).replace(day = 1)
                for row in values_rows
                ]
            for column_index, taxipp_name in enumerate(taxipp_names_row):
                if taxipp_name and strings.slugify(taxipp_name) not in (
                        'date', 'date-ir', 'date-rev', 'note', 'ref-leg', 'notes'):
                    vector = [
                        transform_cell_value(date, row[column_index])
                        for date, row in zip(dates, values_rows)
                        ]
                    # NOTE(review): this keeps non-strings and the literal u'nc' but replaces every
                    # other string with '-'; the condition looks inverted with respect to the later
                    # replace(u'nc', nan) — confirm intended.
                    vector = [
                        cell if not isinstance(cell, basestring) or cell == u'nc' else '-'
                        for cell in vector
                        ]
                    vector_by_taxipp_name[taxipp_name] = pd.Series(vector, index = dates)
        # Re-index every variable on a complete monthly calendar and forward-fill missing months.
        monthstime = [
            datetime.datetime(y, m, 1, 0, 0, 0)
            for y in range(1914, 2021)
            for m in range(1, 13)
            ]
        data_frame = pd.DataFrame(index = monthstime)
        for taxipp_name, vector in vector_by_taxipp_name.iteritems():
            data_frame[taxipp_name] = np.nan
            data_frame.loc[vector.index.values, taxipp_name] = vector.values
        data_frame.replace(u'nc', np.nan, inplace = True)
        data_frame.fillna(method = 'pad', inplace = True)
        data_frame.dropna(axis = 0, how = 'all', inplace = True)
        if option == 'mean_by_year':
            data_frame.replace('-', 0, inplace = True)
            data_frame = data_frame.resample('AS', how = 'mean')
        if option == 'which_month_in_year':
            data_frame = data_frame.iloc[data_frame.index.month == month]
        data_frame.to_csv(args.dir + "/" + bareme + '.csv', encoding = 'utf-8')
        print u"Voilà, la table agrégée de {} est créée !".format(bareme)
    return 0
def index(self, indexed_poi_id):
    """Register this POI's id in every class-level RAM index (categories, date ranges, territories,
    words, transport modes/types, coverage and schema name).

    ``self.bson`` must still hold the raw MongoDB document (cf ``load``).
    """
    poi_bson = self.bson
    metadata = poi_bson['metadata']
    for category_slug in (metadata.get('categories-index') or set()):
        self.ids_by_category_slug.setdefault(category_slug, set()).add(indexed_poi_id)
    if conf['index.date.field']:
        # Insert the POI into the begin/end datetime lists, kept sorted (ascending for begin,
        # descending for end) so searches can stop early.
        for date_range_index, date_range_metadata in enumerate(metadata.get('date-range') or []):
            if date_range_metadata['label'] == conf['index.date.field']:
                date_range_values = poi_bson['date-range'][date_range_index]
                date_range_begin = date_range_values.get('date-range-begin', [None])[0]
                date_range_end = date_range_values.get('date-range-end', [None])[0]
                if date_range_begin is not None:
                    for index, (begin_datetime, poi_id) in enumerate(self.ids_by_begin_datetime):
                        if begin_datetime is not None and begin_datetime > date_range_begin:
                            break
                    else:
                        # NOTE(review): when the scan finds no later entry (or the list is empty),
                        # index falls back to 0, inserting at the front — looks like it could break
                        # the sort order for a non-empty list; confirm intended.
                        index = 0
                    self.ids_by_begin_datetime.insert(index, (date_range_begin, indexed_poi_id))
                if date_range_end is not None:
                    for index, (end_datetime, poi_id) in enumerate(self.ids_by_end_datetime):
                        if end_datetime is not None and end_datetime < date_range_end:
                            break
                    else:
                        index = 0
                    self.ids_by_end_datetime.insert(index, (date_range_end, indexed_poi_id))
        if not metadata.get('date-range'):
            # POIs without a date range are always considered active.
            self.ids_by_begin_datetime.append((None, indexed_poi_id))
            self.ids_by_end_datetime.append((None, indexed_poi_id))
    self.ids_by_last_update_datetime.append((self.last_update_datetime, indexed_poi_id))
    for i, territory_metadata in enumerate(metadata.get('territories') or []):
        # Note: Don't fail when territory doesn't exist, because Passim can be configured to ignore some kinds
        # of territories (cf conf['territories_kinds']).
        self.territories_id = set(
            territory_id
            for territory_id in (
                ramdb.territory_id_by_kind_code.get((territory_kind_code['kind'], territory_kind_code['code']))
                for territory_kind_code in poi_bson['territories'][i]
                )
            if territory_id is not None
            )
        for territory_id in self.territories_id:
            self.ids_by_territory_id.setdefault(territory_id, set()).add(indexed_poi_id)
        # Only the first "territories" entry is used.
        break
    if not self.territories_id:
        self.ids_by_territory_id.setdefault(None, set()).add(indexed_poi_id)
    # Presence territories come from the pre-computed territories-index, excluding country-level entries.
    poi_territories_id = set(
        territory_id
        for territory_id in (
            ramdb.territory_id_by_kind_code.get((territory_kind_code['kind'], territory_kind_code['code']))
            for territory_kind_code in metadata['territories-index']
            if territory_kind_code['kind'] not in (u'Country', u'InternationalOrganization')
            )
        if territory_id is not None
        ) if metadata.get('territories-index') is not None else None
    for territory_id in (poi_territories_id or set()):
        self.ids_by_presence_territory_id.setdefault(territory_id, set()).add(indexed_poi_id)
    # Index every word of the POI name for prefix search.
    for word in strings.slugify(self.name).split(u'-'):
        self.ids_by_word.setdefault(word, set()).add(indexed_poi_id)
    self.slug_by_id[indexed_poi_id] = strings.slugify(self.name)
    if self.schema_name == 'OffreTransport':
        if not self.territories_id and not self.instance_by_id[indexed_poi_id].territories_id:
            # Transport offers without any territory default to country-wide (France).
            france_id = ramdb.territory_id_by_kind_code[(u'Country', u'FR')]
            self.territories_id = set([france_id])
            self.ids_by_territory_id.setdefault(france_id, set()).add(indexed_poi_id)
        for field in self.fields:
            field_slug = strings.slugify(field.label)
            if field.id == 'checkboxes':
                if field_slug == 'mode-de-transport' and field.value is not None:
                    for transport_mode in field.value:
                        self.ids_by_transport_mode.setdefault(transport_mode, set()).add(
                            indexed_poi_id)
            if field.id == 'select' and field_slug == 'type-de-transport' and field.value is not None:
                self.ids_by_transport_type.setdefault(field.value, set()).add(indexed_poi_id)
    if self.schema_name == 'ServiceInfo':
        for field in self.fields:
            if field.id == 'select':
                if strings.slugify(field.label) == 'niveau' and field.value is not None:
                    coverage_slug = strings.slugify(field.value)
                    self.ids_by_coverage.setdefault(coverage_slug, set()).add(indexed_poi_id)
        if self.is_multimodal_info_service():
            # Multimodal information services get a dedicated per-territory index.
            for territory_id in poi_territories_id:
                self.sim_ids_by_territory_id.setdefault(territory_id, set()).add(indexed_poi_id)
    self.ids_by_schema_name.setdefault(self.schema_name, set()).add(indexed_poi_id)
def user_extract(req):
    """WSGI controller: create (or redirect to) a dated copy of the current legislation for the
    authenticated user, using the remote API to compute the dated legislation.
    """
    ctx = contexts.Ctx(req)
    user = model.get_user(ctx, check=True)
    if user.email is None:
        return wsgihelpers.forbidden(ctx)
    legislation = ctx.node
    if legislation.is_owner(ctx) and legislation.is_dated:
        return wsgihelpers.bad_request(
            ctx, explanation=ctx._(u'This legislation is already dated.'))
    params = req.GET
    inputs = {
        'date': params.get('date'),
    }
    # Validate the requested date, defaulting to "now".
    data, errors = conv.struct({
        'date': conv.pipe(
            conv.french_formatted_str_to_datetime,
            conv.default(datetime.datetime.utcnow()),
        ),
    })(inputs, state=ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation=errors)
    new_legislation = None
    new_legislation_title = ctx._(u'{} (copy {})').format(
        legislation.title, user.email)
    new_legislation_slug = strings.slugify(new_legislation_title)
    existing_legislations_cursor = model.Legislation.find(
        dict(slug=new_legislation_slug, ),
        as_class=collections.OrderedDict,
    )
    if existing_legislations_cursor.count() > 0:
        # A copy with the same slug already exists: reuse it when owned by this user,
        # otherwise refuse to create a duplicate.
        for existing_legislation in existing_legislations_cursor:
            if existing_legislation.is_owner(ctx):
                return wsgihelpers.redirect(
                    ctx, location=existing_legislation.get_user_url(ctx))
        # NOTE(review): new_legislation is always None here, so this branch always triggers when
        # no owned copy was found — presumably intended, but the condition is redundant; confirm.
        if new_legislation is None:
            return wsgihelpers.bad_request(
                ctx,
                explanation=ctx._(
                    u'A legislation with the same name already exists.'),
            )
    else:
        new_legislation = model.Legislation(
            author_id=user._id,
            datetime_begin=legislation.datetime_begin,
            datetime_end=legislation.datetime_end,
            description=ctx._(u'Copy of legislation "{}"').format(
                legislation.title),
            title=new_legislation_title,
            slug=new_legislation_slug,
        )
    # Ask the API to compute the dated legislation for the validated date.
    response = requests.post(
        conf['api.urls.legislations'],
        headers={
            'Content-Type': 'application/json',
            'User-Agent': conf['app_name'],
        },
        data=json.dumps(
            dict(date=data['date'].isoformat(), legislation=legislation.json)),
    )
    new_legislation.json = response.json(
        object_pairs_hook=collections.OrderedDict).get('dated_legislation')
    new_legislation.save(safe=True)
    return wsgihelpers.redirect(ctx, location=new_legislation.get_user_url(ctx))
name = 'Compensated Own-Price and Cross-Price Elasticities -- Modest and aged more than 60'
        ),
    ]
# Write one CSV per "global" elasticities table, and aggregate the age/income-specific tables
# into a single cross_price_elasticities.csv.
cross_price_elasticities = pandas.DataFrame()
for table in tables:
    # 'age', 'name' and 'revenus' are our own metadata; the remaining keys of `table` are
    # keyword arguments for pandas.read_excel.
    age = table.pop('age')
    name = table.pop('name')
    revenus = table.pop('revenus')
    df = pandas.read_excel(elasticities_origin_xlsx, **table)
    df.dropna(inplace = True)
    df.set_index('Unnamed: 0', inplace = True)
    df.index.name = 'product'
    if age is None or revenus is None:
        # Table is not split by age/income group: dump it to its own CSV file.
        df.name = name
        csv_path_name = os.path.join(elasticities_path, slugify(name) + '.csv')
        df.to_csv(csv_path_name)
    else:
        # Table is specific to an age/income group: tag the rows and accumulate them.
        df['age'] = age
        df['revenus'] = revenus
        cross_price_elasticities = cross_price_elasticities.append(df)
csv_path_name = os.path.join(
    elasticities_path,
    'cross_price_elasticities.csv',
    )
cross_price_elasticities.to_csv(csv_path_name)
def load():
    """Load MongoDB data into RAM-based database.

    Clears then rebuilds every module-level index: categories (and their pivot codes), territories
    (with ancestor and postal-distribution lookups), schema titles, and the POI indexes.  Also
    resets the global ``last_timestamp`` used to detect data changes since startup.
    """
    from . import model
    start_time = datetime.datetime.utcnow()
    global last_timestamp
    # Remove a few seconds, for data changes that occur during startup.
    last_timestamp = start_time - datetime.timedelta(seconds = 30)

    categories_slug_by_tag_slug.clear()
    categories_slug_by_word.clear()
    category_by_slug.clear()
    category_slug_by_pivot_code.clear()
    for db in model.dbs:
        for category_bson in db[conf['categories_collection']].find(None, ['code', 'tags_code', 'title']):
            # Skip categories whose title doesn't produce a usable slug.
            if not strings.slugify(category_bson.get('title')):
                continue
            category = model.Category.load(category_bson)
            category.index()
    for db in model.dbs:
        for organism_type_bson in db[conf['organism_types_collection']].find(None, ['code', 'slug']):
            if organism_type_bson['slug'] not in category_by_slug:
                log.warning(
                    'Ignoring organism type "{0}" without matching category.'.format(organism_type_bson['code'])
                    )
                continue
            category_slug_by_pivot_code[organism_type_bson['code']] = organism_type_bson['slug']

    territories_id_by_ancestor_id.clear()
    territories_id_by_postal_distribution.clear()
    territories_query = dict(
        kind = {'$in': conf['territories_kinds']},
        )
    territory_by_id.clear()
    territory_id_by_kind_code.clear()
    territories_collection = pymongo.Connection()[conf['territories_database']][conf['territories_collection']]
    territories_fields_list = [
        'ancestors_id', 'code', 'geo', 'hinge_type', 'kind', 'main_postal_distribution', 'name'
        ]
    for territory_bson in territories_collection.find(territories_query, territories_fields_list):
        main_postal_distribution = territory_bson.get('main_postal_distribution')
        if main_postal_distribution is None:
            # Territories without a main postal distribution can't be indexed by postal code.
            continue
        territory_class = model.Territory.kind_to_class(territory_bson['kind'])
        # Fixed: the assert message previously referenced an undefined `class_name`, which would
        # have raised a NameError instead of the intended AssertionError message.
        assert territory_class is not None, 'Invalid territory type name: {0}'.format(territory_bson['kind'])
        territory_id = territory_bson['_id']
        territory = territory_class(
            _id = territory_id,
            ancestors_id = territory_bson['ancestors_id'],
            code = territory_bson['code'],
            geo = territory_bson.get('geo'),
            hinge_type = territory_bson.get('hinge_type'),
            main_postal_distribution = main_postal_distribution,
            name = territory_bson['name'],
            )
        territory_by_id[territory_id] = territory
        for ancestor_id in territory_bson['ancestors_id']:
            territories_id_by_ancestor_id.setdefault(ancestor_id, set()).add(territory_id)
        territory_id_by_kind_code[(territory_bson['kind'], territory_bson['code'])] = territory_id
        territories_id_by_postal_distribution[(
            main_postal_distribution['postal_code'],
            main_postal_distribution['postal_routing'],
            )] = territory_id

    schema_title_by_name.clear()
    for db in model.dbs:
        for schema in db.schemas.find(None, ['name', 'title']):
            schema_title_by_name[schema['name']] = schema['title']

    model.Poi.clear_indexes()
    model.Poi.load_pois()
    model.Poi.index_pois()

    # # Remove unused categories.
    # for category_slug in category_by_slug.keys():
    #     if category_slug not in model.Poi.ids_by_category_slug:
    #         log.warning('Ignoring category "{0}" not used by any POI.'.format(category_slug))
    #         del category_by_slug[category_slug]
    # for category_slug in model.Poi.ids_by_category_slug.keys():
    #     if category_slug not in category_by_slug:
    #         log.warning('Ignoring category "{0}" not defined in categories collection.'.format(category_slug))
    #         del model.Poi.ids_by_category_slug[category_slug]
    # for category_slug in category_by_slug.iterkeys():
    #     for word in category_slug.split(u'-'):
    #         categories_slug_by_word.setdefault(word, set()).add(category_slug)

    log.info('RAM-based database loaded in {0} seconds'.format(datetime.datetime.utcnow() - start_time))
def build_tree_from_yaml_clean(yaml_dir):
    """Walk `yaml_dir` and merge every cleaned IPP YAML file into a single nested ordered tree.

    Each leaf of the returned tree is a list of ``dict(start = date, value = ...)`` items, with
    consecutive identical values merged.  Path fragments are slugified, except the special markers
    'RENAME', 'TRANCHE' and 'TYPE' which are kept verbatim.
    """
    tree = collections.OrderedDict()
    for yaml_dir_encoded, _, filenames_encoded in os.walk(yaml_dir):
        for filename_encoded in sorted(filenames_encoded):
            if not filename_encoded.endswith('.yaml'):
                continue
            filename = filename_encoded.decode(file_system_encoding)
            sheet_name = os.path.splitext(filename)[0]
            yaml_file_path_encoded = os.path.join(yaml_dir_encoded, filename_encoded)
            relative_file_path_encoded = yaml_file_path_encoded[len(yaml_dir):].lstrip(os.sep)
            relative_file_path = relative_file_path_encoded.decode(file_system_encoding)
            # Upper-case sheet names are skipped (presumably summary/outline sheets — confirm).
            if sheet_name.isupper():
                continue
            assert sheet_name.islower(), sheet_name
            log.info(u'Loading file {}'.format(relative_file_path))
            with open(yaml_file_path_encoded) as yaml_file:
                data = yaml.load(yaml_file)
            rows = data.get(u"Valeurs")
            if rows is None:
                log.info(u' Skipping file {} without "Valeurs"'.format(relative_file_path))
                continue
            # Index rows by their effective start date; later rows with the same date win.
            row_by_start = {}
            for row in rows:
                start = row.get(u"Date d'effet")
                if start is None:
                    for date_name in date_names:
                        start = row.get(date_name)
                        if start is not None:
                            break
                    else:
                        # No date found. Skip row.
                        continue
                elif not isinstance(start, datetime.date):
                    start = start[u"Année Revenus"]
                row_by_start[start] = row
            sorted_row_by_start = sorted(row_by_start.items())

            # Collect, for every start date, the ordered list of value paths present in that row,
            # so paths can later be globally sorted in a consistent order.
            relative_ipp_paths_by_start = {}
            unsorted_relative_ipp_paths = set()
            for start, row in sorted_row_by_start:
                relative_ipp_paths_by_start[start] = start_relative_ipp_paths = []
                for name, child in row.items():
                    if name in date_names:
                        continue
                    if name in note_names:
                        continue
                    if name in reference_names:
                        continue
                    start_relative_ipp_paths.extend(
                        (name, ) + tuple(path)
                        for path, value in iter_ipp_values(child))
                unsorted_relative_ipp_paths.update(start_relative_ipp_paths)

            def compare_relative_ipp_paths(x, y):
                # Order two paths by their first co-occurrence in any row; unknown pairs sort as -1.
                if x == y:
                    return 0
                for relative_ipp_paths in relative_ipp_paths_by_start.itervalues():
                    try:
                        return cmp(relative_ipp_paths.index(x), relative_ipp_paths.index(y))
                    except ValueError:
                        # Either x or y paths are missing in relative_ipp_paths => Their order can't
                        # be compared.
                        continue
                return -1

            sorted_relative_ipp_paths = sorted(unsorted_relative_ipp_paths, cmp=compare_relative_ipp_paths)
            # tax_rate_tree_by_bracket_type = {}

            for start, row in sorted_row_by_start:
                for relative_ipp_path in sorted_relative_ipp_paths:
                    # Resolve the value of this path in the current row (None when absent).
                    value = row
                    for fragment in relative_ipp_path:
                        value = value.get(fragment)
                        if value is None:
                            break
                    if value in (u'-', u'na', u'nc'):
                        # Value is unknown. Previous value must be propagated.
                        continue
                    # Build the absolute tree path: directory components, sheet name, then the
                    # in-row path, all slugified except the special markers.
                    ipp_path = [
                        fragment if fragment in ('RENAME', 'TRANCHE', 'TYPE')
                        else strings.slugify(fragment, separator=u'_')
                        for fragment in itertools.chain(
                            relative_file_path.split(os.sep)[:-1],
                            [sheet_name],
                            relative_ipp_path,
                        )
                    ]
                    sub_tree = tree
                    for fragment in ipp_path[:-1]:
                        sub_tree = sub_tree.setdefault(fragment, collections.OrderedDict())
                    fragment = ipp_path[-1]
                    sub_tree = sub_tree.setdefault(fragment, [])
                    if sub_tree:
                        previous_leaf = sub_tree[-1]
                        if previous_leaf['value'] == value:
                            # Merge leaves with the same value.
                            # One day, when we'll support "Références législatives", this behavior
                            # may change.
                            continue
                    sub_tree.append(dict(
                        start=start,
                        value=value,
                    ))
    return tree
def slugify_ipp_translation_key(key):
    """Slugify an IPP translation key, leaving the special markers 'RENAME' and 'TYPE' untouched."""
    if key in ('RENAME', 'TYPE'):
        return key
    return strings.slugify(key, separator = u'_')
def slug(self):
    """Return the slugified form of this object's name."""
    name = self.name
    return strings.slugify(name)
def _append_bareme_values_element(slice_element, tag, name, values_by_slice_name, slice_name):
    """Build a `tag` child element from the values of one scale slice and append it to
    `slice_element` when it contains at least one value element.
    """
    element = etree.Element(tag)
    values, _format, _type = prepare_xml_values(name, values_by_slice_name.get(slice_name, []))
    for value in values:
        value_element = transform_value_to_element(value)
        if value_element is not None:
            element.append(value_element)
    if len(element) > 0:
        slice_element.append(element)


def transform_node_to_element(name, node):
    """Convert a parameter tree node into an XML element, or None when it yields no content.

    `node` is either a dict (a BAREME scale when its 'TYPE' is u'BAREME', otherwise a NODE whose
    items are converted recursively) or a list of values (a CODE leaf).  Element codes are the
    slugified `name`.
    """
    if isinstance(node, dict):
        if node.get("TYPE") == u"BAREME":
            scale_element = etree.Element("BAREME", attrib=dict(code=strings.slugify(name, separator=u"_")))
            for slice_name in node.get("SEUIL", {}).keys():
                slice_element = etree.Element(
                    "TRANCHE", attrib=dict(code=strings.slugify(slice_name, separator=u"_")))
                # The four per-slice components are built identically; order of children matters.
                _append_bareme_values_element(slice_element, "SEUIL", name, node.get("SEUIL", {}), slice_name)
                _append_bareme_values_element(slice_element, "MONTANT", name, node.get("MONTANT", {}), slice_name)
                _append_bareme_values_element(slice_element, "TAUX", name, node.get("TAUX", {}), slice_name)
                _append_bareme_values_element(slice_element, "ASSIETTE", name, node.get("ASSIETTE", {}), slice_name)
                if len(slice_element) > 0:
                    scale_element.append(slice_element)
            return scale_element if len(scale_element) > 0 else None
        else:
            node_element = etree.Element("NODE", attrib=dict(code=strings.slugify(name, separator=u"_")))
            for key, value in node.iteritems():
                child_element = transform_node_to_element(key, value)
                if child_element is not None:
                    node_element.append(child_element)
            return node_element if len(node_element) > 0 else None
    else:
        assert isinstance(node, list), node
        values, format, type = prepare_xml_values(name, node)
        if not values:
            return None
        code_element = etree.Element("CODE", attrib=dict(code=strings.slugify(name, separator=u"_")))
        if format is not None:
            code_element.set("format", format)
        if type is not None:
            code_element.set("type", type)
        for value in values:
            value_element = transform_value_to_element(value)
            if value_element is not None:
                code_element.append(value_element)
        return code_element if len(code_element) > 0 else None
def index(self, indexed_poi_id):
    """Register this POI's id in the class-level RAM indexes (categories, date ranges, competence
    and presence territories, name words and slug).

    ``self.bson`` must still hold the raw MongoDB document (cf ``load``).
    """
    poi_bson = self.bson
    metadata = poi_bson['metadata']
    for category_slug in (metadata.get('categories-index') or set()):
        self.ids_by_category_slug.setdefault(category_slug, set()).add(indexed_poi_id)
    if conf['index.date.field']:
        # Insert the POI into the begin/end datetime lists, kept sorted so searches can stop early.
        for date_range_index, date_range_metadata in enumerate(metadata.get('date-range') or []):
            if date_range_metadata['label'] == conf['index.date.field']:
                date_range_values = poi_bson['date-range'][date_range_index]
                date_range_begin = date_range_values.get('date-range-begin', [None])[0]
                date_range_end = date_range_values.get('date-range-end', [None])[0]
                if date_range_begin is not None:
                    for index, (begin_datetime, poi_id) in enumerate(self.ids_by_begin_datetime):
                        if begin_datetime is not None and begin_datetime > date_range_begin:
                            break
                    else:
                        # NOTE(review): when the scan finds no later entry (or the list is empty),
                        # index falls back to 0, inserting at the front — looks like it could break
                        # the sort order for a non-empty list; confirm intended.
                        index = 0
                    self.ids_by_begin_datetime.insert(index, (date_range_begin, indexed_poi_id))
                if date_range_end is not None:
                    for index, (end_datetime, poi_id) in enumerate(self.ids_by_end_datetime):
                        if end_datetime is not None and end_datetime < date_range_end:
                            break
                    else:
                        index = 0
                    self.ids_by_end_datetime.insert(index, (date_range_end, indexed_poi_id))
        if not metadata.get('date-range'):
            # POIs without a date range are always considered active.
            self.ids_by_begin_datetime.append((None, indexed_poi_id))
            self.ids_by_end_datetime.append((None, indexed_poi_id))
    self.ids_by_last_update_datetime.append((self.last_update_datetime, indexed_poi_id))
    for i, territory_metadata in enumerate(metadata.get('territories') or []):
        # Note: Don't fail when territory doesn't exist, because Etalage can be configured to ignore some kinds
        # of territories (cf conf['territories_kinds']).
        self.competence_territories_id = set(
            territory_id
            for territory_id in (
                ramdb.territory_id_by_kind_code.get((territory_kind_code['kind'], territory_kind_code['code']))
                for territory_kind_code in poi_bson['territories'][i]
                )
            if territory_id is not None
            )
        for territory_id in self.competence_territories_id:
            self.ids_by_competence_territory_id.setdefault(territory_id, set()).add(indexed_poi_id)
        # Only the first "territories" entry is used.
        break
    if not self.competence_territories_id:
        # POIs without explicit competence territories are indexed as competent "by nature".
        self.ids_by_competence_territory_id.setdefault(None, set()).add(indexed_poi_id)
    # Presence territories come from the pre-computed territories-index, excluding country-level entries.
    poi_territories_id = set(
        territory_id
        for territory_id in (
            ramdb.territory_id_by_kind_code.get((territory_kind_code['kind'], territory_kind_code['code']))
            for territory_kind_code in metadata['territories-index']
            if territory_kind_code['kind'] not in (u'Country', u'InternationalOrganization')
            )
        if territory_id is not None
        ) if metadata.get('territories-index') is not None else None
    for territory_id in (poi_territories_id or set()):
        self.ids_by_presence_territory_id.setdefault(territory_id, set()).add(indexed_poi_id)
    # Index every word of the POI name for prefix search.
    for word in strings.slugify(self.name).split(u'-'):
        self.ids_by_word.setdefault(word, set()).add(indexed_poi_id)
    self.slug_by_id[indexed_poi_id] = strings.slugify(self.name)
def index_list(req):
    """WSGI controller: render the list ("liste") view of matching POIs.

    Parses search inputs from the query string, resolves the matching POI ids,
    then walks each POI's fields to build the per-territory, per-level
    ("niveau"), transport-type and web-site lookup tables handed to the
    ``/list.mako`` template.  May raise an HTTP redirect when the search term
    is exactly a POI slug, or a bad-request error on invalid inputs.
    """
    ctx = contexts.Ctx(req)
    params = req.GET
    inputs = init_base(ctx, params)
    inputs.update(model.Poi.extract_search_inputs_from_params(ctx, params))
    inputs.update(dict(
        coverage = params.get('coverage'),
        page = params.get('page'),
        poi_index = params.get('poi_index'),
        sort_key = params.get('sort_key'),
        ))
    mode = u'liste'
    data, errors = conv.inputs_to_pois_list_data(inputs, state = ctx)
    non_territorial_search_data = model.Poi.extract_non_territorial_search_data(ctx, data)
    if errors is not None:
        raise wsgihelpers.bad_request(ctx, explanation = ctx._('Error: {0}').format(errors))
    # The territory filter is either an explicit geolocation or a term that was
    # converted to a Territory instance (a plain string term is not a territory).
    territory = data['geolocation'] or (data['term'] if not isinstance(data['term'], basestring) else None)
    if non_territorial_search_data.get('term') and not isinstance(non_territorial_search_data['term'], basestring):
        # The term was resolved to a territory => don't also use it as a text term.
        non_territorial_search_data['term'] = None
    pois_id_iter = model.Poi.iter_ids(
        ctx,
        territory = territory,
        coverages = None if data['coverage'] is None else [data['coverage']],
        **non_territorial_search_data)
    if isinstance(data['term'], basestring):
        # A textual term that exactly matches a POI slug redirects to that POI's page.
        # NOTE(review): when no slug matches, this loop still consumes
        # ``pois_id_iter``; if iter_ids returns a one-shot iterator the listing
        # below would then be empty — confirm iter_ids' return type.
        for poi_id in pois_id_iter:
            poi = model.Poi.instance_by_id[poi_id]
            if data['term'] == poi.slug:
                raise wsgihelpers.redirect(ctx, location = urls.get_url(ctx, 'organismes', poi.slug, poi._id))
    ids_by_territory_id = dict()
    multimodal_info_services_by_id = dict()
    national_territory_id = ramdb.territory_id_by_kind_code[('Country', 'FR')]
    ids_by_niveau = dict()
    transport_types_by_id = dict()
    web_site_by_id = dict()
    for poi in (
            model.Poi.instance_by_id.get(poi_id)
            for poi_id in pois_id_iter
            ):
        if poi is None:
            continue
        for field in poi.generate_all_fields():
            if poi._id in model.Poi.multimodal_info_service_ids:
                # Multimodal information services are listed separately, not by territory.
                multimodal_info_services_by_id[poi._id] = poi
            else:
                if field.id == 'links' and strings.slugify(field.label) == 'offres-de-transport':
                    # Collect the transport types of every linked transport offer.
                    for transport_offer in [
                            transport_offer
                            for transport_offer in (
                                model.Poi.instance_by_id.get(transport_offer_id)
                                for transport_offer_id in field.value
                                )
                            if transport_offer is not None
                            ]:
                        # NOTE(review): this inner loop reuses (shadows) the outer
                        # ``field`` variable; after it runs, the checks below see the
                        # transport offer's last field instead of the POI's field —
                        # confirm whether this is intentional.
                        for field in transport_offer.fields:
                            field_slug = strings.slugify(field.label)
                            if field_slug == 'type-de-transport' and field.value is not None:
                                transport_types_by_id.setdefault(poi._id, set()).add(field.value)
                if field.id == 'territories' and strings.slugify(field.label) == 'territoire-couvert':
                    for territory_id in field.value:
                        if isinstance(data['term'], model.Territory) and territory_id in data['term'].ancestors_id:
                            territory = ramdb.territory_by_id[territory_id]
                            if territory.__class__.__name__ != 'UrbanTransportsPerimeterOfFrance':
                                # Direct ancestor match => file the POI under that territory.
                                ids_by_territory_id.setdefault(territory_id, set()).add(poi._id)
                                break
                            else:
                                # Urban transport perimeters (PTU) are not shown as such:
                                # find a member commune whose postal routing contains all
                                # the words of the PTU's postal routing and file under it.
                                PTU_postal_routing = territory.main_postal_distribution.get('postal_routing')
                                if PTU_postal_routing is not None:
                                    # NOTE(review): territories_id_by_ancestor_id.get() may
                                    # return None, which would make this loop raise — presumably
                                    # every PTU has children; verify.
                                    for child_territory_id in ramdb.territories_id_by_ancestor_id.get(territory_id):
                                        child_territory = ramdb.territory_by_id.get(child_territory_id)
                                        if child_territory.__class__.__name__ != 'CommuneOfFrance':
                                            continue
                                        child_territory_postal_routing = child_territory.main_postal_distribution.get(
                                            'postal_routing'
                                            )
                                        if all(map(
                                                lambda word: word in child_territory_postal_routing.split(),
                                                PTU_postal_routing.split(),
                                                )):
                                            ids_by_territory_id.setdefault(child_territory_id, set()).add(poi._id)
                                            break
                    else:
                        # No specific territory matched => file the POI under France.
                        # NOTE(review): the PTU branch above breaks only the inner loop,
                        # so a POI filed under a commune may ALSO fall through here and
                        # be filed as national — confirm this double-filing is intended.
                        ids_by_territory_id.setdefault(national_territory_id, set()).add(poi._id)
                if field.id == 'select' and strings.slugify(field.label) == 'niveau':
                    # Normalize the "niveau" select value to a canonical key.
                    # NOTE(review): unmapped values produce a None key in ids_by_niveau.
                    ids_by_niveau_key = {
                        'local': 'local',
                        'locale': 'local',
                        'national': 'national',
                        'departemental': 'departmental',
                        'regional': 'regional',
                        }.get(strings.slugify(field.value))
                    ids_by_niveau.setdefault(ids_by_niveau_key, set()).add(poi._id)
                if field.id == 'url' and strings.slugify(field.label) == 'site-web-url':
                    # The dedicated "site web" field always wins...
                    web_site_by_id[poi._id] = field.value
                elif field.id == 'url' and web_site_by_id.get(poi._id) is None:
                    # ...otherwise keep the first URL field seen as a fallback.
                    web_site_by_id[poi._id] = field.value
    multimodal_info_services = model.Poi.sort_and_paginate_pois_list(
        ctx,
        None,
        multimodal_info_services_by_id,
        multimodal_info_services = True,
        )
    return templates.render(
        ctx,
        '/list.mako',
        data = data,
        errors = errors,
        ids_by_territory_id = ids_by_territory_id,
        inputs = inputs,
        mode = mode,
        multimodal_info_services = multimodal_info_services,
        ids_by_niveau = ids_by_niveau,
        transport_types_by_id = transport_types_by_id,
        web_site_by_id = web_site_by_id,
        **non_territorial_search_data)
def main():
    """Convert the IPP "prélèvements sociaux" XLS workbooks into a tree of nodes.

    For each workbook listed in the module-level ``baremes``, every data sheet is
    parsed with a small state machine (taxipp_names -> labels -> values -> notes
    -> description) and appended to ``root_node`` as NODE/CODE/VALUE dicts.

    Returns 0 (shell exit code).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dir', default = 'Baremes_IPP_2015', help = 'path of IPP XLS directory')
    parser.add_argument('-v', '--verbose', action = 'store_true', default = False, help = "increase output verbosity")
    args = parser.parse_args()
    # args.dir = path
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    root_node = dict(
        children = [],
        name = "root",
        text = textwrap.dedent(u"""\
            Ce document présente l'ensemble de la législation permettant le calcul des contributions sociales, taxes
            sur les salaires et cotisations sociales. Il s'agit des barèmes bruts de la législation utilisés dans le
            micro-simulateur de l'IPP, TAXIPP. Les sources législatives (texte de loi, numéro du décret ou arrêté)
            ainsi que la date de publication au Journal Officiel de la République française (JORF) sont
            systématiquement indiquées. La première ligne du fichier (masquée) indique le nom des paramètres dans
            TAXIPP.

            Citer cette source : Barèmes IPP: prélèvements sociaux, Institut des politiques publiques, avril 2014.

            Auteurs : Antoine Bozio, Julien Grenet, Malka Guillot, Laura Khoury et Marianne Tenand

            Contacts : [email protected]; [email protected]; [email protected]

            Licence : Licence ouverte / Open Licence
            """).split(u'\n'),
        title = u"Barème IPP",
        type = u'NODE',
        )
    for bareme in baremes:
        xls_path = os.path.join(args.dir.decode('utf-8'), u"Baremes IPP - {0}.xls".format(bareme))
        if not os.path.exists(xls_path):
            log.warning("Skipping file {} that doesn't exist: {}".format(bareme, xls_path))
            continue
        log.info(u'Parsing file {}'.format(bareme))
        # formatting_info is required so xlrd keeps merged-cell and hyperlink data.
        book = xlrd.open_workbook(filename = xls_path, formatting_info = True)

        # Skip the abbreviations/outline sheets and any sheet explicitly
        # blacklisted for this workbook in ``forbiden_sheets``.
        sheet_names = [
            sheet_name
            for sheet_name in book.sheet_names()
            if not sheet_name.startswith((u'Abréviations', u'Outline')) and sheet_name not in forbiden_sheets.get(
                bareme, [])
            ]
        sheet_title_by_name = {}
        for sheet_name in sheet_names:
            log.info(u'  Parsing sheet {}'.format(sheet_name))
            sheet = book.sheet_by_name(sheet_name)

            # Extract coordinates of merged cells: map every (row, column) inside a
            # merged range back to the range's top-left cell, where the value lives.
            merged_cells_tree = {}
            for row_low, row_high, column_low, column_high in sheet.merged_cells:
                for row_index in range(row_low, row_high):
                    cell_coordinates_by_merged_column_index = merged_cells_tree.setdefault(
                        row_index, {})
                    for column_index in range(column_low, column_high):
                        cell_coordinates_by_merged_column_index[column_index] = (row_low, column_low)

            if sheet_name.startswith(u'Sommaire'):
                # Associate the titles of the sheets to their Excel names, using the
                # summary sheet's internal hyperlinks (column 2 = number, column 3 = title).
                for row_index in range(sheet.nrows):
                    linked_sheet_number = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 2)
                    if isinstance(linked_sheet_number, int):
                        linked_sheet_title = transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, 3)
                        if linked_sheet_title is not None:
                            hyperlink = get_hyperlink(sheet, row_index, 3)
                            if hyperlink is not None and hyperlink.type == u'workbook':
                                linked_sheet_name = hyperlink.textmark.split(u'!', 1)[0].strip(u'"').strip(u"'")
                                sheet_title_by_name[linked_sheet_name] = linked_sheet_title
                # Summary sheets carry no data of their own.
                continue

            # State machine over the sheet's rows. States advance monotonically:
            # taxipp_names -> labels -> values -> notes -> description.
            descriptions_rows = []
            labels_rows = []
            notes_rows = []
            state = 'taxipp_names'
            taxipp_names_row = None
            values_rows = []
            for row_index in range(sheet.nrows):
                columns_count = len(sheet.row_values(row_index))
                if state == 'taxipp_names':
                    # First (hidden) row: TAXIPP variable names, one per column.
                    taxipp_names_row = [
                        taxipp_name
                        for taxipp_name in (
                            transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(columns_count)
                            )
                        ]
                    state = 'labels'
                    continue
                if state == 'labels':
                    first_cell_value = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0)
                    date_or_year, error = conv.pipe(
                        conv.test_isinstance((int, basestring)),
                        cell_to_date,
                        conv.not_none,
                        )(first_cell_value, state = conv.default_state)
                    if error is not None:
                        # First cell of row is not a date => Assume it is a label.
                        labels_rows.append([
                            transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(columns_count)
                            ])
                        continue
                    # First date encountered => fall through to the values state.
                    state = 'values'
                if state == 'values':
                    first_cell_value = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0)
                    if first_cell_value is None or isinstance(first_cell_value, (int, basestring)):
                        date_or_year, error = cell_to_date(first_cell_value, state = conv.default_state)
                        if error is None:
                            # First cell of row is a valid date or year.
                            values_row = [
                                transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, column_index)
                                for column_index in range(columns_count)
                                ]
                            if date_or_year is not None:
                                # Guard against misparsed dates (e.g. Excel serials read as years).
                                assert date_or_year.year < 2601, 'Invalid date {} in {} at row {}'.format(date_or_year, sheet_name, row_index + 1)
                                values_rows.append(values_row)
                                continue
                            if all(value in (None, u'') for value in values_row):
                                # If first cell is empty and all other cells in line are also empty, ignore this line.
                                continue
                    # First cell has no date and other cells in row are not empty => Assume it is a note.
                    state = 'notes'
                if state == 'notes':
                    first_cell_value = transform_xls_cell_to_json(book, sheet, merged_cells_tree, row_index, 0)
                    if isinstance(first_cell_value, basestring) and first_cell_value.strip().lower() == 'notes':
                        notes_rows.append([
                            transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                            for column_index in range(columns_count)
                            ])
                        continue
                    state = 'description'
                assert state == 'description'
                descriptions_rows.append([
                    transform_xls_cell_to_str(book, sheet, merged_cells_tree, row_index, column_index)
                    for column_index in range(columns_count)
                    ])

            # Flatten notes then descriptions into the sheet's text, one pipe-joined
            # line per row, with a None separator between the two sections.
            text_lines = []
            for row in notes_rows:
                text_lines.append(u' | '.join(
                    cell
                    for cell in row
                    if cell
                    ))
            if text_lines:
                text_lines.append(None)
            for row in descriptions_rows:
                text_lines.append(u' | '.join(
                    cell
                    for cell in row
                    if cell
                    ))

            sheet_title = sheet_title_by_name.get(sheet_name)
            if sheet_title is None:
                log.warning(u"Missing title for sheet {} in summary".format(sheet_name))
                continue
            # Collapse the (possibly multi-row, merged-cell) label rows into one
            # label per column; repeated values from merged cells are deduplicated.
            labels = []
            for labels_row in labels_rows:
                for column_index, label in enumerate(labels_row):
                    if not label:
                        continue
                    while column_index >= len(labels):
                        labels.append([])
                    labels_column = labels[column_index]
                    if not labels_column or labels_column[-1] != label:
                        labels_column.append(label)
            # A column with several stacked labels becomes a tuple, otherwise a string.
            labels = [
                tuple(labels_column1) if len(labels_column1) > 1 else labels_column1[0]
                for labels_column1 in labels
                ]

            cell_by_label_rows = []
            for value_row in values_rows:
                cell_by_label = collections.OrderedDict(itertools.izip(labels, value_row))
                cell_by_label, errors = values_row_converter(cell_by_label, state = conv.default_state)
                assert errors is None, "Errors in {}:\n{}".format(cell_by_label, errors)
                cell_by_label_rows.append(cell_by_label)

            sheet_node = dict(
                children = [],
                name = strings.slugify(sheet_name, separator = u'_'),
                text = text_lines,
                title = sheet_title,
                type = u'NODE',
                )
            root_node['children'].append(sheet_node)

            # One CODE node per TAXIPP variable, with one VALUE child per data row.
            for taxipp_name, labels_column in zip(taxipp_names_row, labels):
                if not taxipp_name or taxipp_name in (u'date',):
                    continue
                variable_node = dict(
                    children = [],
                    name = strings.slugify(taxipp_name, separator = u'_'),
                    title = u' - '.join(labels_column) if isinstance(labels_column, tuple) else labels_column,
                    type = u'CODE',
                    )
                sheet_node['children'].append(variable_node)

                for cell_by_label in cell_by_label_rows:
                    # values_row_converter yields either a bare amount or an
                    # (amount, unit) tuple for this column.
                    amount_and_unit = cell_by_label[labels_column]
                    variable_node['children'].append(dict(
                        law_reference = cell_by_label[u'Références législatives'],
                        notes = cell_by_label[u'Notes'],
                        publication_date = cell_by_label[u"Parution au JO"],
                        start_date = cell_by_label[u"Date d'entrée en vigueur"],
                        type = u'VALUE',
                        unit = amount_and_unit[1] if isinstance(amount_and_unit, tuple) else None,
                        value = amount_and_unit[0] if isinstance(amount_and_unit, tuple) else amount_and_unit,
                        ))

            # dates = [
            #     conv.check(cell_to_date)(
            #         row[1] if bareme == u'Impot Revenu' else row[0],
            #         state = conv.default_state,
            #         )
            #     for row in values_rows
            #     ]
            # for column_index, taxipp_name in enumerate(taxipp_names_row):
            #     if taxipp_name and strings.slugify(taxipp_name) not in (
            #             'date',
            #             'date-ir',
            #             'date-rev',
            #             'note',
            #             'notes',
            #             'ref-leg',
            #             ):
            #         vector = [
            #             transform_cell_value(date, row[column_index])
            #             for date, row in zip(dates, values_rows)
            #             ]
            #         vector = [
            #             cell if not isinstance(cell, basestring) or cell == u'nc' else '-'
            #             for cell in vector
            #             ]
            #         # vector_by_taxipp_name[taxipp_name] = pd.Series(vector, index = dates)
            #         vector_by_taxipp_name[taxipp_name] = vector

    # print_node(root_node)

    return 0