def read_unicode(fn):
    """Read a Unicode file that may be encoded as utf_16_le, utf_16_be, or utf_8."""
    from codecs import BOM_UTF16_LE, BOM_UTF16_BE, BOM_UTF8
    with open(fn, "rb") as in_file:
        bs = in_file.read()
    if bs.startswith(BOM_UTF16_LE):
        us = bs.decode("utf_16_le").lstrip(BOM_UTF16_LE.decode("utf_16_le"))
    elif bs.startswith(BOM_UTF16_BE):
        us = bs.decode("utf_16_be").lstrip(BOM_UTF16_BE.decode("utf_16_be"))
    else:
        us = bs.decode("utf_8").lstrip(BOM_UTF8.decode("utf_8"))
    return us
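# Hedged usage sketch for read_unicode() above (Python 3): write a small
# UTF-16-LE file with a BOM, then read it back. The file name
# "bom_example.txt" is illustrative, not taken from the original source.
import codecs

with open("bom_example.txt", "wb") as out_file:
    out_file.write(codecs.BOM_UTF16_LE + "héllo".encode("utf_16_le"))
assert read_unicode("bom_example.txt") == "héllo"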
def test_import_stops_txt_bom(self):
    if PY3:  # pragma: no cover
        text = (BOM_UTF8.decode('utf-8') + """\
stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,\
location_type,parent_station,stop_timezone
FUR_CREEK_RES,FC,Furnace Creek Resort,,36.425288,-117.133162,A,\
http://example.com/fcr,0,FUR_CREEK_STA,
FUR_CREEK_STA,,Furnace Creek Station,"Our Station",36.425288,-117.133162,A,\
http://example.com,1,,America/Los_Angeles
""")
    else:
        text = (BOM_UTF8 + b"""\
stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,\
location_type,parent_station,stop_timezone
FUR_CREEK_RES,FC,Furnace Creek Resort,,36.425288,-117.133162,A,\
http://example.com/fcr,0,FUR_CREEK_STA,
FUR_CREEK_STA,,Furnace Creek Station,"Our Station",36.425288,-117.133162,A,\
http://example.com,1,,America/Los_Angeles
""")
    stops_txt = StringIO(text)
    Stop.import_txt(stops_txt, self.feed)
    self.assertEqual(Stop.objects.count(), 2)
    station = Stop.objects.get(stop_id='FUR_CREEK_STA')
    stop = Stop.objects.get(stop_id='FUR_CREEK_RES')
    self.assertEqual(stop.parent_station, station)
def test_utf8_bom():
    unicode_bom = BOM_UTF8.decode('utf-8')

    module = parso.parse(unicode_bom)
    endmarker = module.children[0]
    assert endmarker.type == 'endmarker'
    assert unicode_bom == endmarker.prefix

    module = parso.parse(unicode_bom + 'foo = 1')
    expr_stmt = module.children[0]
    assert expr_stmt.type == 'expr_stmt'
    assert unicode_bom == expr_stmt.get_first_leaf().prefix
def bom_prefix_csv(text):
    """
    Prefix CSV text with a Byte-order Marker (BOM).

    The return value needs to be encoded differently so the CSV reader
    will handle the BOM correctly:
    - Python 2 returns a UTF-8 encoded bytestring
    - Python 3 returns unicode text
    """
    if PY3:
        return BOM_UTF8.decode('utf-8') + text
    else:
        return BOM_UTF8 + text.encode('utf-8')
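# Hedged usage sketch for bom_prefix_csv() above (Python 3 path only); the
# header and row values are illustrative, not from the original tests.
import csv
import io

prefixed = bom_prefix_csv("name,value\nalpha,1\n")
rows = list(csv.reader(io.StringIO(prefixed)))
# The first column name still carries the BOM ('\ufeffname') unless the
# consuming code strips it, which is what the BOM-aware import tests exercise.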
def test_import_bom(self):
    if PY3:  # pragma: no cover
        text = (BOM_UTF8.decode('utf-8') + """\
agency_name,agency_url,agency_timezone
Demo Transit Authority,http://google.com,America/Los_Angeles
""")
    else:
        text = (BOM_UTF8 + b"""\
agency_name,agency_url,agency_timezone
Demo Transit Authority,http://google.com,America/Los_Angeles
""")
    agency_txt = StringIO(text)
    Agency.import_txt(agency_txt, self.feed)
    agency = Agency.objects.get()
    self.assertEqual(agency.agency_id, '')
    self.assertEqual(agency.name, 'Demo Transit Authority')
    self.assertEqual(agency.url, 'http://google.com')
    self.assertEqual(agency.timezone, 'America/Los_Angeles')
    self.assertEqual(agency.lang, '')
    self.assertEqual(agency.phone, '')
    self.assertEqual(agency.fare_url, '')
def test_eval_bom(self):
    self.assertEqual(eval(BOM_UTF8 + '"foo"'), 'foo')
    # Actual BOM ignored, so causes a SyntaxError
    self.assertRaises(SyntaxError, eval,
                      BOM_UTF8.decode('iso-8859-1') + '"foo"')
decode_utf8 = decode_string
encode_utf8 = encode_string

#### CACHE #########################################################################################
# Caching is implemented in URL.download(), which is used by all other downloaders.

import os
import glob
import tempfile
import datetime

from io import open

from codecs import BOM_UTF8
BOM_UTF8 = BOM_UTF8.decode('utf-8')

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

TMP = os.path.join(tempfile.gettempdir(), "pattern_web")


def date_now():
    return datetime.datetime.today()


def date_modified(path):
    # os.stat(path)[8] is st_mtime, the last modification time.
    return datetime.datetime.fromtimestamp(os.stat(path)[8])
from itertools import chain

import os
import sys

try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

from .__init__ import Graph, Node, Edge, bfs
from .__init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS

from codecs import BOM_UTF8

if sys.version > "3":
    BOM_UTF8 = BOM_UTF8.decode("utf-8")
    basestring = str

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

#### COMMONSENSE SEMANTIC NETWORK ########################################

#--- CONCEPT -------------------------------------------------------------

class Concept(Node):
try:
    from itertools import zip_longest
except ImportError:
    # Python 2
    from itertools import izip_longest as zip_longest

from codecs import BOM_UTF8

import pytest

import parso

unicode_bom = BOM_UTF8.decode('utf-8')


@pytest.mark.parametrize(('string', 'tokens'), [
    ('', ['']),
    ('#', ['#', '']),
    (' # ', ['# ', '']),
    (' # \n', ['# ', '\n', '']),
    (' # \f\n', ['# ', '\f', '\n', '']),
    (' \n', ['\n', '']),
    (' \n ', ['\n', ' ']),
    (' \f ', ['\f', ' ']),
    (' \f ', ['\f', ' ']),
    (' \r\n', ['\r\n', '']),
    ('\\\n', ['\\\n', '']),
    ('\\\r\n', ['\\\r\n', '']),
    ('\t\t\n\t', ['\n', '\t']),
])
def test_simple_prefix_splitting(string, tokens):
def __enter__(self):
    with open(self.filename, encoding='utf-8') as infile:
        content = infile.read()
    if content[0] == BOM_UTF8.decode('utf8'):
        content = content[1:]
    return content
from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
                                NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
                                ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
                                FSTRING_END)
from parso._compatibility import py_version
from parso.utils import split_lines

TokenCollection = namedtuple(
    'TokenCollection',
    'pseudo_token single_quoted triple_quoted endpats whitespace '
    'fstring_pattern_map always_break_tokens',
)

BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

_token_collection_cache = {}

if py_version >= 30:
    # Python 3 has str.isidentifier() to check if a char is a valid identifier
    is_identifier = str.isidentifier
else:
    namechars = string.ascii_letters + '_'
    is_identifier = lambda s: s in namechars


def group(*choices, **kwargs):
    capture = kwargs.pop('capture', False)  # Python 2, arrghhhhh :(
    assert not kwargs
from codecs import BOM_UTF8  # needed for CSV_BOM below
from datetime import datetime, date
from logging import getLogger
import re

from django.contrib.gis.db import models
from django.contrib.gis.db.models.query import GeoQuerySet
from django.db.models.fields.related import ManyToManyField
from django.utils.six import StringIO, text_type, PY3

from multigtfs.compat import get_blank_value, write_text_rows

logger = getLogger(__name__)

re_point = re.compile(r'(?P<name>point)\[(?P<index>\d)\]')
batch_size = 1000
large_queryset_size = 100000
CSV_BOM = BOM_UTF8.decode('utf-8') if PY3 else BOM_UTF8


class BaseQuerySet(GeoQuerySet):
    def populated_column_map(self):
        '''Return the _column_map without unused optional fields'''
        column_map = []
        cls = self.model
        for csv_name, field_pattern in cls._column_map:
            # Separate the local field name from foreign columns
            if '__' in field_pattern:
                field_name = field_pattern.split('__', 1)[0]
            else:
                field_name = field_pattern

            # Handle point fields
from __future__ import absolute_import

import itertools as _itertools
import re
import sys
from codecs import BOM_UTF8
from collections import namedtuple
from dataclasses import dataclass
from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple

from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.utils import PythonVersionInfo, split_lines

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = "\U0010ffff"
BOM_UTF8_STRING = BOM_UTF8.decode("utf-8")

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ASYNC = PythonTokenTypes.ASYNC
AWAIT = PythonTokenTypes.AWAIT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
from itertools import zip_longest
from codecs import BOM_UTF8

import pytest

import parso

unicode_bom = BOM_UTF8.decode('utf-8')


@pytest.mark.parametrize(('string', 'tokens'), [
    ('', ['']),
    ('#', ['#', '']),
    (' # ', ['# ', '']),
    (' # \n', ['# ', '\n', '']),
    (' # \f\n', ['# ', '\f', '\n', '']),
    (' \n', ['\n', '']),
    (' \n ', ['\n', ' ']),
    (' \f ', ['\f', ' ']),
    (' \f ', ['\f', ' ']),
    (' \r\n', ['\r\n', '']),
    (' \r', ['\r', '']),
    ('\\\n', ['\\\n', '']),
    ('\\\r\n', ['\\\r\n', '']),
    ('\t\t\n\t', ['\n', '\t']),
])
def test_simple_prefix_splitting(string, tokens):
    tree = parso.parse(string)
    leaf = tree.children[0]
    assert leaf.type == 'endmarker'
from codecs import BOM_UTF8  # needed for CSV_BOM below
from csv import reader, writer
from datetime import datetime, date
from logging import getLogger
import re

from django.contrib.gis.db import models
from django.db.models.fields.related import ManyToManyField
from django.utils.six import StringIO, text_type, PY3

from multigtfs.compat import (
    get_blank_value, write_text_rows, Manager, QuerySet)

logger = getLogger(__name__)

re_point = re.compile(r'(?P<name>point)\[(?P<index>\d)\]')
batch_size = 1000
CSV_BOM = BOM_UTF8.decode('utf-8') if PY3 else BOM_UTF8


class BaseQuerySet(QuerySet):
    def populated_column_map(self):
        '''Return the _column_map without unused optional fields'''
        column_map = []
        cls = self.model
        for csv_name, field_pattern in cls._column_map:
            # Separate the local field name from foreign columns
            if '__' in field_pattern:
                field_name = field_pattern.split('__', 1)[0]
            else:
                field_name = field_pattern

            # Handle point fields
    if os.path.exists(fullname):
        return fullname
    return None

# }}}


# {{{ file encoding detection

# the main idea stolen from Python 3.1's tokenize.py, by Ka-Ping Yee

import re
cookie_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)")

from codecs import lookup, BOM_UTF8
if PY3:
    BOM_UTF8 = BOM_UTF8.decode()


def detect_encoding(lines):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, lines,
    an iterable stream of lines.

    It will read a maximum of two lines, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
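# Hedged usage sketch for the lines-based detect_encoding() above; the byte
# lines are illustrative. With a PEP 263 cookie on the first line, the
# reported encoding should be "utf-8", and the second value holds the raw
# byte lines that were consumed while looking for it.
source_lines = [b"# -*- coding: utf-8 -*-\n", b"x = 1\n"]
encoding, consumed = detect_encoding(iter(source_lines))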
from codecs import BOM_UTF8

import agate
import datetime
import isodate
import json
import dbt.utils
from typing import Iterable, List, Dict, Union, Optional, Any

from dbt.exceptions import RuntimeException

BOM = BOM_UTF8.decode('utf-8')  # '\ufeff'


class ISODateTime(agate.data_types.DateTime):
    def cast(self, d):
        # this is agate.data_types.DateTime.cast with the "clever" bits removed
        # so we only handle ISO8601 stuff
        if isinstance(d, datetime.datetime) or d is None:
            return d
        elif isinstance(d, datetime.date):
            return datetime.datetime.combine(d, datetime.time(0, 0, 0))
        elif isinstance(d, str):
            d = d.strip()
            if d.lower() in self.null_values:
                return None
            try:
                return isodate.parse_datetime(d)
            except:  # noqa
                pass
# Python 3
from urllib.request import urlopen

from .__init__ import Graph, Node, Edge, bfs
from .__init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS

import os
import sys
from codecs import BOM_UTF8  # needed for the decode below

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

if sys.version > "3":
    BOM_UTF8 = str(BOM_UTF8.decode("utf-8"))
else:
    BOM_UTF8 = BOM_UTF8.decode("utf-8")

#### COMMONSENSE SEMANTIC NETWORK ##################################################################

#--- CONCEPT ---------------------------------------------------------------------------------------

class Concept(Node):

    def __init__(self, *args, **kwargs):
        """ A concept in the semantic network.
        """
        Node.__init__(self, *args, **kwargs)
        self._properties = None
from itertools import chain

import os
import sys

try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

from .__init__ import Graph, Node, Edge, bfs
from .__init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS

from codecs import BOM_UTF8

if sys.version > "3":
    BOM_UTF8 = BOM_UTF8.decode("utf-8")
    basestring = str

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except:
    MODULE = ""

#### COMMONSENSE SEMANTIC NETWORK ########################################

#--- CONCEPT -------------------------------------------------------------

class Concept(Node):

    def __init__(self, *args, **kwargs):
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END

TokenCollection = namedtuple(
    'TokenCollection',
    'pseudo_token single_quoted triple_quoted endpats whitespace '
    'fstring_pattern_map always_break_tokens',
)

BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

_token_collection_cache = {}

if sys.version_info.major >= 3:
    # Python 3 has str.isidentifier() to check if a char is a valid identifier
    is_identifier = str.isidentifier
else:
    # Python 2 doesn't, but it's not that important anymore and if you tokenize
    # Python 2 code with this, it's still ok. It's just that parsing Python 3
    # code with this function is not 100% correct.
    # This just means that Python 2 code matches a few identifiers too much,
    # but that doesn't really matter.
    def is_identifier(s):
        return True
# Copyright 2017, Jarsa Sistemas, S.A. de C.V.
# License LGPL-3.0 or later (http://www.gnu.org/licenses/lgpl).

import base64
from codecs import BOM_UTF8

from suds.client import Client

from odoo import _, api, models, tools
from odoo.tools.float_utils import float_repr

BOM_UTF8U = BOM_UTF8.decode('UTF-8')
CFDI_SAT_QR_STATE = {
    'No Encontrado': 'not_found',
    'Cancelado': 'cancelled',
    'Vigente': 'valid',
}


class AccountInvoice(models.Model):
    _inherit = 'account.invoice'

    @api.multi
    def generate_xml_attachment(self):
        self.ensure_one()
        if not self.l10n_mx_edi_cfdi:
            return False
        fname = ("%s-%s-MX-Bill-%s.xml" % (
            self.journal_id.code, self.reference,
            self.company_id.partner_id.vat or '')).replace('/', '')
        data_attach = {
def import_txt(cls, txt_file, feed, filter_func=None):
    '''Import from the GTFS text file'''

    # Setup the conversion from GTFS to Django Format
    # Conversion functions
    def no_convert(value): return value

    def date_convert(value): return datetime.strptime(value, '%Y%m%d')

    def bool_convert(value): return (value == '1')

    def char_convert(value): return (value or '')

    def null_convert(value): return (value or None)

    def point_convert(value): return (value or 0.0)

    cache = {}

    def default_convert(field):
        def get_value_or_default(value):
            if value == '' or value is None:
                return field.get_default()
            else:
                return value
        return get_value_or_default

    def instance_convert(field, feed, rel_name):
        def get_instance(value):
            if value.strip():
                key1 = "{}:{}".format(field.rel.to.__name__, rel_name)
                key2 = text_type(value)

                # Load existing objects
                if key1 not in cache:
                    pairs = field.rel.to.objects.filter(
                        **{field.rel.to._rel_to_feed: feed}).values_list(
                        rel_name, 'id')
                    cache[key1] = dict((text_type(x), i) for x, i in pairs)

                # Create new?
                if key2 not in cache[key1]:
                    kwargs = {
                        field.rel.to._rel_to_feed: feed,
                        rel_name: value}
                    cache[key1][key2] = field.rel.to.objects.create(
                        **kwargs).id
                return cache[key1][key2]
            else:
                return None
        return get_instance

    # Check unique fields
    column_names = [c for c, _ in cls._column_map]
    for unique_field in cls._unique_fields:
        assert unique_field in column_names, \
            '{} not in {}'.format(unique_field, column_names)

    # Map of field_name to converters from GTFS to Django format
    val_map = dict()
    name_map = dict()
    point_map = dict()
    for csv_name, field_pattern in cls._column_map:
        # Separate the local field name from foreign columns
        if '__' in field_pattern:
            field_base, rel_name = field_pattern.split('__', 1)
            field_name = field_base + '_id'
        else:
            field_name = field_base = field_pattern
        # Use the field name in the name mapping
        name_map[csv_name] = field_name

        # Is it a point field?
        point_match = re_point.match(field_name)
        if point_match:
            field = None
        else:
            field = cls._meta.get_field_by_name(field_base)[0]

        # Pick a conversion function for the field
        if point_match:
            converter = point_convert
        elif isinstance(field, models.DateField):
            converter = date_convert
        elif isinstance(field, models.BooleanField):
            converter = bool_convert
        elif isinstance(field, models.CharField):
            converter = char_convert
        elif field.rel:
            converter = instance_convert(field, feed, rel_name)
            assert not isinstance(field, models.ManyToManyField)
        elif field.null:
            converter = null_convert
        elif field.has_default():
            converter = default_convert(field)
        else:
            converter = no_convert

        if point_match:
            index = int(point_match.group('index'))
            point_map[csv_name] = (index, converter)
        else:
            val_map[csv_name] = converter

    # Read and convert the source txt
    csv_reader = reader(txt_file)
    unique_line = dict()
    count = 0
    first = True
    extra_counts = defaultdict(int)
    if PY3:  # pragma: no cover
        bom = BOM_UTF8.decode('utf-8')
    else:  # pragma: no cover
        bom = BOM_UTF8
    new_objects = []
    for row in csv_reader:
        if first:
            # Read the columns
            columns = row
            if columns[0].startswith(bom):
                columns[0] = columns[0][len(bom):]
            first = False
            continue

        if filter_func and not filter_func(zip(columns, row)):
            continue

        # Read a data row
        fields = dict()
        point_coords = [None, None]
        ukey_values = {}
        if cls._rel_to_feed == 'feed':
            fields['feed'] = feed
        for column_name, value in zip(columns, row):
            if column_name not in name_map:
                val = null_convert(value)
                if val is not None:
                    fields.setdefault('extra_data', {})[column_name] = val
                    extra_counts[column_name] += 1
            elif column_name in val_map:
                fields[name_map[column_name]] = val_map[column_name](value)
            else:
                assert column_name in point_map
                pos, converter = point_map[column_name]
                point_coords[pos] = converter(value)

            # Is it part of the unique key?
            if column_name in cls._unique_fields:
                ukey_values[column_name] = value

        # Join the lat/long into a point
        if point_map:
            assert point_coords[0] and point_coords[1]
            fields['point'] = "POINT(%s)" % (' '.join(point_coords))

        # Is the item unique?
        """
        ukey = tuple(ukey_values.get(u) for u in cls._unique_fields)
        if ukey in unique_line:
            logger.warning(
                '%s line %d is a duplicate of line %d, not imported.',
                cls._filename, csv_reader.line_num, unique_line[ukey])
            continue
        else:
            unique_line[ukey] = csv_reader.line_num
        """

        # Create after accumulating a batch
        new_objects.append(cls(**fields))
        if len(new_objects) % batch_size == 0:  # pragma: no cover
            cls.objects.bulk_create(new_objects)
            count += len(new_objects)
            logger.info(
                "Imported %d %s",
                count, cls._meta.verbose_name_plural)
            new_objects = []

    # Create remaining objects
    if new_objects:
        cls.objects.bulk_create(new_objects)

    # Take note of extra fields
    if extra_counts:
        extra_columns = feed.meta.setdefault(
            'extra_columns', {}).setdefault(cls.__name__, [])
        for column in columns:
            if column in extra_counts and column not in extra_columns:
                extra_columns.append(column)
        feed.save()
    return len(unique_line)
    dirname = os.readlink(dirname)
    fullname = os.path.join(dirname, filename)
    if os.path.exists(fullname):
        return fullname
    return None

# }}}


# {{{ file encoding detection

# stolen from Python 3.1's tokenize.py, by Ka-Ping Yee

import re
cookie_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)")

from codecs import lookup, BOM_UTF8
if PY3:
    BOM_UTF8 = BOM_UTF8.decode()


def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
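# Hedged usage sketch for the readline-based detect_encoding() above; the
# source bytes are illustrative.
import io

buf = io.BytesIO(b"# -*- coding: utf-8 -*-\nx = 1\n")
encoding, consumed = detect_encoding(buf.readline)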
def lstrip_bom(str_, bom=BOM_UTF8.decode('utf-8')):
    # Note: decoding with 'utf-8' (not 'utf-8-sig') keeps the BOM character
    # '\ufeff', so there is actually something to strip; the 'utf-8-sig'
    # codec would swallow the BOM and yield an empty default.
    if str_.startswith(bom):
        return str_[len(bom):]
    else:
        return str_
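# Hedged usage sketch for lstrip_bom() above; the sample strings are illustrative.
assert lstrip_bom("\ufeffstop_id,stop_name") == "stop_id,stop_name"
assert lstrip_bom("no bom here") == "no bom here"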
def __enter__(self):
    content = open(self.filename, encoding="utf-8").read()
    if content[0] == BOM_UTF8.decode("utf8"):
        content = content[1:]
    return content
# Fix the XML when it is not decoded
import oerplib
import argparse
import base64
from lxml import objectify
from codecs import BOM_UTF8

BOM_UTF8U = BOM_UTF8.decode('UTF-8')

PARSER = argparse.ArgumentParser()
PARSER.add_argument("-d", "--db", help="DataBase Name", required=True)
PARSER.add_argument("-r", "--user", help="OpenERP User", required=True)
PARSER.add_argument("-w", "--passwd", help="OpenERP Password", required=True)
PARSER.add_argument("-p", "--port", type=int,
                    help="Port, 8069 for default", default="8069")
PARSER.add_argument("-s", "--server",
                    help="Server IP, 127.0.0.1 for default",
                    default="127.0.0.1")
ARGS = PARSER.parse_args()

if ARGS.db is None or ARGS.user is None or ARGS.passwd is None:
    print("DataBase, User and Password must be specified")
    quit()

DB_NAME = ARGS.db
USER = ARGS.user
PASSW = ARGS.passwd
SERVER = ARGS.server
PORT = ARGS.port
OERP_CONNECT = oerplib.OERP(SERVER,