Example #1
def read_unicode(fn):
    """Read an Unicode file that may encode with utf_16_le, utf_16_be, or utf_8.
    """
    from codecs import BOM_UTF16_LE, BOM_UTF16_BE, BOM_UTF8

    with open(fn, "rb") as in_file:
        bs = in_file.read()

    if bs.startswith(BOM_UTF16_LE):
        us = bs.decode("utf_16_le").lstrip(BOM_UTF16_LE.decode("utf_16_le"))
    elif bs.startswith(BOM_UTF16_BE):
        us = bs.decode("utf_16_be").lstrip(BOM_UTF16_BE.decode("utf_16_be"))
    else:
        us = bs.decode("utf_8").lstrip(BOM_UTF8.decode("utf_8"))

    return us
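
A quick round-trip check of read_unicode (the file name and contents are invented for illustration): write a BOM-prefixed UTF-16-LE file, then read it back.

from codecs import BOM_UTF16_LE

# Hypothetical demo file; any writable path works.
with open("demo_utf16.txt", "wb") as out_file:
    out_file.write(BOM_UTF16_LE + "hola".encode("utf_16_le"))

assert read_unicode("demo_utf16.txt") == "hola"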
Example #2
    def test_import_stops_txt_bom(self):
        if PY3:  # pragma: no cover
            text = (BOM_UTF8.decode('utf-8') + """\
stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,\
location_type,parent_station,stop_timezone
FUR_CREEK_RES,FC,Furnace Creek Resort,,36.425288,-117.133162,A,\
http://example.com/fcr,0,FUR_CREEK_STA,
FUR_CREEK_STA,,Furnace Creek Station,"Our Station",36.425288,-117.133162,A,\
http://example.com,1,,America/Los_Angeles
""")
        else:
            text = (BOM_UTF8 + b"""\
stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,\
location_type,parent_station,stop_timezone
FUR_CREEK_RES,FC,Furnace Creek Resort,,36.425288,-117.133162,A,\
http://example.com/fcr,0,FUR_CREEK_STA,
FUR_CREEK_STA,,Furnace Creek Station,"Our Station",36.425288,-117.133162,A,\
http://example.com,1,,America/Los_Angeles
""")
        stops_txt = StringIO(text)
        Stop.import_txt(stops_txt, self.feed)
        self.assertEqual(Stop.objects.count(), 2)
        station = Stop.objects.get(stop_id='FUR_CREEK_STA')
        stop = Stop.objects.get(stop_id='FUR_CREEK_RES')
        self.assertEqual(stop.parent_station, station)
Example #3
    def _load(self, file_path, har_enc='utf-8'):
        self.file_name = file_path.split('/')[-1]

        try:
            with open(file_path, 'r', encoding=har_enc) as fi:
                # Decode with plain 'utf-8' so the BOM survives as '\ufeff'
                # and can be removed ('utf-8-sig' would decode it to '').
                har_text = fi.read().replace(BOM_UTF8.decode('utf-8'), "")

        except FileNotFoundError:
            return None

        return har_text
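
The replace() above can be avoided entirely by opening the file with Python's 'utf-8-sig' codec, which consumes a leading BOM during decoding; a minimal sketch under that assumption (the helper name is illustrative):

def load_text(file_path):
    # 'utf-8-sig' transparently strips a leading BOM if one is present.
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as fi:
            return fi.read()
    except FileNotFoundError:
        return None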
Example #4
def test_utf8_bom():
    unicode_bom = BOM_UTF8.decode('utf-8')

    module = parso.parse(unicode_bom)
    endmarker = module.children[0]
    assert endmarker.type == 'endmarker'
    assert unicode_bom == endmarker.prefix

    module = parso.parse(unicode_bom + 'foo = 1')
    expr_stmt = module.children[0]
    assert expr_stmt.type == 'expr_stmt'
    assert unicode_bom == expr_stmt.get_first_leaf().prefix
Example #5
def test_utf8_bom():
    unicode_bom = BOM_UTF8.decode('utf-8')

    module = marso.parse(unicode_bom)
    endmarker = module.children[0]
    assert endmarker.type == 'endmarker'
    assert unicode_bom == endmarker.prefix

    module = marso.parse(unicode_bom + 'foo = 1')
    expr_stmt = module.children[0]
    assert expr_stmt.type == 'expr_stmt'
    assert unicode_bom == expr_stmt.get_first_leaf().prefix
Example #6
def bom_prefix_csv(text):
    """
    Prefix CSV text with a Byte-order Marker (BOM).

    The return value needs to be encoded differently so the CSV reader will
    handle the BOM correctly:
    - Python 2 returns a UTF-8 encoded bytestring
    - Python 3 returns unicode text
    """
    if PY3:
        return BOM_UTF8.decode('utf-8') + text
    else:
        return BOM_UTF8 + text.encode('utf-8')
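
What the helper's output looks like to the csv module on the Python 3 path (sample rows invented, assuming the PY3 flag is true): the BOM rides along on the first column name unless the consumer strips it, which is exactly what the GTFS import code in a later example does.

import csv
import io

prefixed = bom_prefix_csv("name,qty\nwidget,2\n")
first_row = next(csv.reader(io.StringIO(prefixed)))
print(first_row)  # ['\ufeffname', 'qty'] - the BOM sticks to the first cell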
Example #7
 def get(self, k, unicode=True):
     """ Returns the data stored with the given id.
         With unicode=True, returns a Unicode string.
     """
     if k in self:
         f = open(self._hash(k), "rb")
          # BOM_UTF8 is already bytes, matching the binary read mode.
          v = f.read().lstrip(BOM_UTF8)
         f.close()
         if unicode is True:
             return decode_utf8(v)
         else:
             return v
     raise KeyError(k)
Example #8
def bom_prefix_csv(text):
    """
    Prefix CSV text with a Byte-order Marker (BOM).

    The return value needs to be encoded differently so the CSV reader will
    handle the BOM correctly:
    - Python 2 returns a UTF-8 encoded bytestring
    - Python 3 returns unicode text
    """
    if PY3:
        return BOM_UTF8.decode("utf-8") + text
    else:
        return BOM_UTF8 + text.encode("utf-8")
Example #9
    def handle(self, *args, **options):
        i = options['input']
     
        if not os.path.isfile(i):
            raise CommandError('File not found: {}'.format(i))

        def no_convert(value): return value
        def date_convert(value): return datetime.strptime(value, '%Y%m%d')
        def bool_convert(value): return (value == '1')
        def char_convert(value): return (value or '')
        def null_convert(value): return (value or None)
        def point_convert(value): return (value or 0.0)
        def int_convert_capacity(value):
            try:
                return int(float(value))
            except (TypeError, ValueError):
                return -1

        # Report the line count, then re-open the file for the CSV reader.
        with open(i, 'r') as count_file:
            print(sum(1 for line in count_file))
        csv_reader = reader(open(i, 'r'))
        CSV_BOM = BOM_UTF8.decode('utf-8') if PY3 else BOM_UTF8

        first = True
        for n, row in enumerate(csv_reader):
            if n % 1000 == 0:
                print(n)
            if first:
                # Read the columns
                columns = row
                if columns[0].startswith(CSV_BOM):
                    columns[0] = columns[0][len(CSV_BOM):]
                first = False
                print(columns)
                continue
            try:
                _, created = Capacity.objects.get_or_create(
                    trip_id=int(float(row[7])),
                    stop_time_id=int(float(row[10])),
                    service_date_id=int(float(row[2])),
                    capacity1st=int_convert_capacity(row[-2]),
                    capacity2nd=int_convert_capacity(row[-1]),
                )
            except Exception as e:
                self.stdout.write("Error with row {} : {}".format(row, e))


        self.stdout.write("Done")
Example #10
    def test_import_bom(self):
        if PY3:  # pragma: no cover
            text = (BOM_UTF8.decode('utf-8') + """\
agency_name,agency_url,agency_timezone
Demo Transit Authority,http://google.com,America/Los_Angeles
""")
        else:
            text = (BOM_UTF8 + b"""\
agency_name,agency_url,agency_timezone
Demo Transit Authority,http://google.com,America/Los_Angeles
""")
        agency_txt = StringIO(text)
        Agency.import_txt(agency_txt, self.feed)
        agency = Agency.objects.get()
        self.assertEqual(agency.agency_id, '')
        self.assertEqual(agency.name, 'Demo Transit Authority')
        self.assertEqual(agency.url, 'http://google.com')
        self.assertEqual(agency.timezone, 'America/Los_Angeles')
        self.assertEqual(agency.lang, '')
        self.assertEqual(agency.phone, '')
        self.assertEqual(agency.fare_url, '')
Example #11
            dirname = os.readlink(dirname)
        fullname = os.path.join(dirname, filename)
        if os.path.exists(fullname):
            return fullname
    return None

# }}}

# {{{ file encoding detection
# stolen from Python 3.1's tokenize.py, by Ka-Ping Yee

import re
cookie_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)")
from codecs import lookup, BOM_UTF8
if PY3:
    BOM_UTF8 = BOM_UTF8.decode()


def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
Example #12
decode_utf8 = decode_string
encode_utf8 = encode_string

#### CACHE #########################################################################################
# Caching is implemented in URL.download(), which is used by all other downloaders.

import os
import glob
import tempfile
import datetime

from io import open

from codecs import BOM_UTF8
BOM_UTF8 = BOM_UTF8.decode('utf-8')

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:  # __file__ is undefined in some environments (e.g. frozen)
    MODULE = ""

TMP = os.path.join(tempfile.gettempdir(), "pattern_web")


def date_now():
    return datetime.datetime.today()


def date_modified(path):
    return datetime.datetime.fromtimestamp(os.stat(path)[8])
Example #13
from itertools import chain
import os
import sys

try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

from .__init__ import Graph, Node, Edge, bfs
from .__init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS

from codecs import BOM_UTF8
if sys.version_info >= (3,):
    BOM_UTF8 = BOM_UTF8.decode("utf-8")

    basestring = str

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:  # __file__ is undefined in some environments (e.g. frozen)
    MODULE = ""

#### COMMONSENSE SEMANTIC NETWORK ########################################

#--- CONCEPT -------------------------------------------------------------


class Concept(Node):
Example #14
import string
from codecs import BOM_UTF8
from collections import namedtuple

from parso.python.token import (tok_name, ENDMARKER, STRING, NUMBER, opmap,
                                NAME, ERRORTOKEN, NEWLINE, INDENT, DEDENT,
                                ERROR_DEDENT, FSTRING_STRING, FSTRING_START,
                                FSTRING_END)
from parso._compatibility import py_version
from parso.utils import split_lines


TokenCollection = namedtuple(
    'TokenCollection',
    'pseudo_token single_quoted triple_quoted endpats whitespace '
    'fstring_pattern_map always_break_tokens',
)

BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

_token_collection_cache = {}

if py_version >= 30:
    # Python 3 has str.isidentifier() to check if a char is a valid identifier
    is_identifier = str.isidentifier
else:
    namechars = string.ascii_letters + '_'
    is_identifier = lambda s: s in namechars


def group(*choices, **kwargs):
    capture = kwargs.pop('capture', False)  # Python 2, arrghhhhh :(
    assert not kwargs
Example #15
import re
from codecs import BOM_UTF8

from parso.python.tokenize import group

unicode_bom = BOM_UTF8.decode('utf-8')


class PrefixPart(object):
    def __init__(self, leaf, typ, value, spacing='', start_pos=None):
        assert start_pos is not None
        self.parent = leaf
        self.type = typ
        self.value = value
        self.spacing = spacing
        self.start_pos = start_pos

    @property
    def end_pos(self):
        if self.value.endswith('\n'):
            return self.start_pos[0] + 1, 0
        if self.value == unicode_bom:
            # The bom doesn't have a length at the start of a Python file.
            return self.start_pos
        return self.start_pos[0], self.start_pos[1] + len(self.value)

    def create_spacing_part(self):
        column = self.start_pos[1] - len(self.spacing)
        return PrefixPart(self.parent,
                          'spacing',
                          self.spacing,
Example #16
 def __enter__(self):
     with open(self.filename, encoding='utf-8') as infile:
         content = infile.read()
     if content[0] == BOM_UTF8.decode('utf8'):
         content = content[1:]
     return content
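
For context, this __enter__ returns the file's text rather than a file object. A minimal wrapper class it could belong to might look like this (the class name and the pass-through __exit__ are assumptions, not part of the original):

from codecs import BOM_UTF8

class BomFreeReader:
    def __init__(self, filename):
        self.filename = filename

    def __enter__(self):
        with open(self.filename, encoding='utf-8') as infile:
            content = infile.read()
        if content[0] == BOM_UTF8.decode('utf8'):
            content = content[1:]
        return content

    def __exit__(self, exc_type, exc_value, traceback):
        return False  # do not suppress exceptions

# Usage: with BomFreeReader('example.txt') as text: ...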
Example #17
 def __enter__(self):
     content = open(self.filename, encoding="utf-8").read()
     if content[0] == BOM_UTF8.decode("utf8"):
         content = content[1:]
     return content
Example #18
import base64
import logging
from os.path import splitext
from codecs import BOM_UTF8

from lxml import objectify

from odoo import _, api, models
from odoo.exceptions import ValidationError

BOM_UTF8U = BOM_UTF8.decode('UTF-8')

_logger = logging.getLogger(__name__)


class IrAttachment(models.Model):
    _inherit = 'ir.attachment'

    def unlink(self):
        """In order to deny XML attachments deletion from an invoice, because
        the xml attachments are legal documents.
        This method validates the content of the xml in the deletion process,
        looking for a valid UUID that matches against the 'cfdi_uuid' in the
        invoice related to the attachment
        """
        self.filtered(
            lambda r: r.datas and r.res_model == 'account.move' and splitext(
                r.name)[1].lower() == '.xml').check_valid_uuid()
        return super(IrAttachment, self).unlink()

    def check_valid_uuid(self):
Example #19
    # Python 3
    from urllib.request import urlopen

from .__init__ import Graph, Node, Edge, bfs
from .__init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS

import os
import sys

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:  # __file__ is undefined in some environments (e.g. frozen)
    MODULE = ""

if sys.version_info >= (3,):
    BOM_UTF8 = str(BOM_UTF8.decode("utf-8"))
else:
    BOM_UTF8 = BOM_UTF8.decode("utf-8")

#### COMMONSENSE SEMANTIC NETWORK ##################################################################

#--- CONCEPT ---------------------------------------------------------------------------------------


class Concept(Node):

    def __init__(self, *args, **kwargs):
        """ A concept in the sematic network.
        """
        Node.__init__(self, *args, **kwargs)
        self._properties = None
Example #20
 def _bom(
     cls,
     txt: str,
 ) -> str:
     return "" if txt.isascii() else BOM_UTF8.decode(cls._ENCODING)
Example #21
from itertools import chain
import os
import sys

try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen  # Python 2

from .__init__ import Graph, Node, Edge, bfs
from .__init__ import WEIGHT, CENTRALITY, EIGENVECTOR, BETWEENNESS

from codecs import BOM_UTF8
if sys.version_info >= (3,):
    BOM_UTF8 = BOM_UTF8.decode("utf-8")

    basestring = str

try:
    MODULE = os.path.dirname(os.path.realpath(__file__))
except NameError:  # __file__ is undefined in some environments (e.g. frozen)
    MODULE = ""

#### COMMONSENSE SEMANTIC NETWORK ########################################

#--- CONCEPT -------------------------------------------------------------


class Concept(Node):
    def __init__(self, *args, **kwargs):
Example #22
 def test_eval_bom(self):
     self.assertEqual(eval(BOM_UTF8 + '"foo"'), 'foo')
     # As bytes, the BOM is recognized and skipped; decoded as iso-8859-1
     # it becomes ordinary characters, which cause a SyntaxError.
     self.assertRaises(SyntaxError, eval,
                       BOM_UTF8.decode('iso-8859-1') + '"foo"')
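
The asymmetry this test relies on: as bytes, a leading UTF-8 BOM is treated as an encoding marker and skipped by the compiler, but decoded with iso-8859-1 those three bytes become the ordinary characters 'ï»¿', which are not valid Python. A Python 3 sketch of the same behavior:

from codecs import BOM_UTF8

print(eval(BOM_UTF8 + b'"foo"'))  # 'foo': the BOM is recognized and skipped
try:
    eval(BOM_UTF8.decode('iso-8859-1') + '"foo"')
except SyntaxError as exc:
    print('SyntaxError:', exc)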
Example #23
try:
    from itertools import zip_longest
except ImportError:
    # Python 2
    from itertools import izip_longest as zip_longest

from codecs import BOM_UTF8

import pytest

import parso

unicode_bom = BOM_UTF8.decode('utf-8')


@pytest.mark.parametrize(('string', 'tokens'), [
    ('', ['']),
    ('#', ['#', '']),
    (' # ', ['# ', '']),
    (' # \n', ['# ', '\n', '']),
    (' # \f\n', ['# ', '\f', '\n', '']),
    ('  \n', ['\n', '']),
    ('  \n ', ['\n', ' ']),
    (' \f ', ['\f', ' ']),
    (' \f ', ['\f', ' ']),
    (' \r\n', ['\r\n', '']),
    ('\\\n', ['\\\n', '']),
    ('\\\r\n', ['\\\r\n', '']),
    ('\t\t\n\t', ['\n', '\t']),
])
def test_simple_prefix_splitting(string, tokens):
Example #24
from codecs import BOM_UTF8

import agate
import json

BOM = BOM_UTF8.decode('utf-8')  # '\ufeff'

DEFAULT_TYPE_TESTER = agate.TypeTester(types=[
    agate.data_types.Number(null_values=('null', '')),
    agate.data_types.TimeDelta(null_values=('null', '')),
    agate.data_types.Date(null_values=('null', '')),
    agate.data_types.DateTime(null_values=('null', '')),
    agate.data_types.Boolean(true_values=('true', ),
                             false_values=('false', ),
                             null_values=('null', '')),
    agate.data_types.Text(null_values=('null', ''))
])


def table_from_data(data, column_names):
    "Convert list of dictionaries into an Agate table"

    # The agate table is generated from a list of dicts, so the column order
    # from `data` is not preserved. We can use `select` to reorder the columns
    #
    # If there is no data, create an empty table with the specified columns

    if len(data) == 0:
        return agate.Table([], column_names=column_names)
    else:
        table = agate.Table.from_object(data, column_types=DEFAULT_TYPE_TESTER)
Example #25
 def __enter__(self):
     content = open(self.filename, encoding='utf-8').read()
     if content[0] == BOM_UTF8.decode('utf8'):
         content = content[1:]
     return content
Example #26
from __future__ import absolute_import

import itertools as _itertools
import re
import sys
from codecs import BOM_UTF8
from collections import namedtuple
from dataclasses import dataclass
from typing import Dict, Generator, Iterable, Optional, Pattern, Set, Tuple

from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.utils import PythonVersionInfo, split_lines

# Maximum code point of Unicode 6.0: 0x10ffff (1,114,111)
MAX_UNICODE = "\U0010ffff"
BOM_UTF8_STRING = BOM_UTF8.decode("utf-8")

STRING = PythonTokenTypes.STRING
NAME = PythonTokenTypes.NAME
NUMBER = PythonTokenTypes.NUMBER
OP = PythonTokenTypes.OP
NEWLINE = PythonTokenTypes.NEWLINE
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ASYNC = PythonTokenTypes.ASYNC
AWAIT = PythonTokenTypes.AWAIT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
Example #27
def lstrip_bom(str_, bom=BOM_UTF8.decode('utf-8')):
    if str_.startswith(bom):
        return str_[len(bom):]
    else:
        return str_
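
A quick check of the helper (strings invented). Note the default decodes BOM_UTF8 with plain 'utf-8' to get '\ufeff'; decoding with 'utf-8-sig' would swallow the BOM and leave an empty default, turning the function into a no-op.

from codecs import BOM_UTF8

bom = BOM_UTF8.decode('utf-8')
assert lstrip_bom(bom + 'data') == 'data'  # leading BOM removed
assert lstrip_bom('data') == 'data'        # plain text passes through
assert lstrip_bom(bom + bom) == bom        # only one BOM is stripped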
Example #28
    def import_txt(cls, txt_file, feed, filter_func=None):
        '''Import from the GTFS text file'''

        # Setup the conversion from GTFS to Django Format
        # Conversion functions
        def no_convert(value): return value

        def date_convert(value): return datetime.strptime(value, '%Y%m%d')

        def bool_convert(value): return (value == '1')

        def char_convert(value): return (value or '')

        def null_convert(value): return (value or None)

        def point_convert(value): return (value or 0.0)

        cache = {}

        def default_convert(field):
            def get_value_or_default(value):
                if value == '' or value is None:
                    return field.get_default()
                else:
                    return value
            return get_value_or_default

        def instance_convert(field, feed, rel_name):
            def get_instance(value):
                if value.strip():
                    key1 = "{}:{}".format(field.rel.to.__name__, rel_name)
                    key2 = text_type(value)

                    # Load existing objects
                    if key1 not in cache:
                        pairs = field.rel.to.objects.filter(
                            **{field.rel.to._rel_to_feed: feed}).values_list(
                            rel_name, 'id')
                        cache[key1] = dict((text_type(x), i) for x, i in pairs)

                    # Create new?
                    if key2 not in cache[key1]:
                        kwargs = {
                            field.rel.to._rel_to_feed: feed,
                            rel_name: value}
                        cache[key1][key2] = field.rel.to.objects.create(
                            **kwargs).id
                    return cache[key1][key2]
                else:
                    return None
            return get_instance

        # Check unique fields
        column_names = [c for c, _ in cls._column_map]
        for unique_field in cls._unique_fields:
            assert unique_field in column_names, \
                '{} not in {}'.format(unique_field, column_names)

        # Map of field_name to converters from GTFS to Django format
        val_map = dict()
        name_map = dict()
        point_map = dict()
        for csv_name, field_pattern in cls._column_map:
            # Separate the local field name from foreign columns
            if '__' in field_pattern:
                field_base, rel_name = field_pattern.split('__', 1)
                field_name = field_base + '_id'
            else:
                field_name = field_base = field_pattern
            # Use the field name in the name mapping
            name_map[csv_name] = field_name

            # Is it a point field?
            point_match = re_point.match(field_name)
            if point_match:
                field = None
            else:
                field = cls._meta.get_field_by_name(field_base)[0]

            # Pick a conversion function for the field
            if point_match:
                converter = point_convert
            elif isinstance(field, models.DateField):
                converter = date_convert
            elif isinstance(field, models.BooleanField):
                converter = bool_convert
            elif isinstance(field, models.CharField):
                converter = char_convert
            elif field.rel:
                converter = instance_convert(field, feed, rel_name)
                assert not isinstance(field, models.ManyToManyField)
            elif field.null:
                converter = null_convert
            elif field.has_default():
                converter = default_convert(field)
            else:
                converter = no_convert

            if point_match:
                index = int(point_match.group('index'))
                point_map[csv_name] = (index, converter)
            else:
                val_map[csv_name] = converter

        # Read and convert the source txt
        csv_reader = reader(txt_file)
        unique_line = dict()
        count = 0
        first = True
        extra_counts = defaultdict(int)
        if PY3:  # pragma: no cover
            bom = BOM_UTF8.decode('utf-8')
        else:  # pragma: no cover
            bom = BOM_UTF8
        new_objects = []
        for row in csv_reader:
            if first:
                # Read the columns
                columns = row
                if columns[0].startswith(bom):
                    columns[0] = columns[0][len(bom):]
                first = False
                continue

            if filter_func and not filter_func(zip(columns, row)):
                continue

            # Read a data row
            fields = dict()
            point_coords = [None, None]
            ukey_values = {}
            if cls._rel_to_feed == 'feed':
                fields['feed'] = feed
            for column_name, value in zip(columns, row):
                if column_name not in name_map:
                    val = null_convert(value)
                    if val is not None:
                        fields.setdefault('extra_data', {})[column_name] = val
                        extra_counts[column_name] += 1
                elif column_name in val_map:
                    fields[name_map[column_name]] = val_map[column_name](value)
                else:
                    assert column_name in point_map
                    pos, converter = point_map[column_name]
                    point_coords[pos] = converter(value)

                # Is it part of the unique key?
                if column_name in cls._unique_fields:
                    ukey_values[column_name] = value

            # Join the lat/long into a point
            if point_map:
                assert point_coords[0] and point_coords[1]
                fields['point'] = "POINT(%s)" % (' '.join(point_coords))

            # Is the item unique?
            """
            ukey = tuple(ukey_values.get(u) for u in cls._unique_fields)
            if ukey in unique_line:
                logger.warning(
                    '%s line %d is a duplicate of line %d, not imported.',
                    cls._filename, csv_reader.line_num, unique_line[ukey])
                continue
            else:
                unique_line[ukey] = csv_reader.line_num
            """
            # Create after accumulating a batch
            new_objects.append(cls(**fields))
            if len(new_objects) % batch_size == 0:  # pragma: no cover
                cls.objects.bulk_create(new_objects)
                count += len(new_objects)
                logger.info(
                    "Imported %d %s",
                    count, cls._meta.verbose_name_plural)
                new_objects = []

        # Create remaining objects
        if new_objects:
            cls.objects.bulk_create(new_objects)

        # Take note of extra fields
        if extra_counts:
            extra_columns = feed.meta.setdefault(
                'extra_columns', {}).setdefault(cls.__name__, [])
            for column in columns:
                if column in extra_counts and column not in extra_columns:
                    extra_columns.append(column)
            feed.save()
        return len(unique_line)
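
The first-row handling in import_txt is the standard idiom for BOM'd CSV text: if present, the BOM rides along on the first column header and must be sliced off. Distilled into a self-contained sketch (sample data invented):

from codecs import BOM_UTF8
from csv import reader
from io import StringIO

bom = BOM_UTF8.decode('utf-8')
rows = reader(StringIO(bom + 'stop_id,stop_name\nS1,Main St\n'))
columns = next(rows)
if columns[0].startswith(bom):
    columns[0] = columns[0][len(bom):]
print(columns)  # ['stop_id', 'stop_name']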
Example #29
import re
from codecs import BOM_UTF8
from datetime import datetime, date
from logging import getLogger
from csv import reader, writer
from django.contrib.gis.db import models
from django.db.models.fields.related import ManyToManyField
from django.utils.six import StringIO, text_type, PY3
from multigtfs.compat import (get_blank_value, write_text_rows, Manager,
                              QuerySet)

logger = getLogger(__name__)
re_geom = re.compile(r'(?P<name>geom)\[(?P<index>\d)\]')

batch_size = 1000
large_queryset_size = 100000
CSV_BOM = BOM_UTF8.decode('utf-8') if PY3 else BOM_UTF8


class BaseQuerySet(QuerySet):
    def populated_column_map(self):
        '''Return the _column_map without unused optional fields'''
        column_map = []
        cls = self.model
        for csv_name, field_pattern in cls._column_map:
            # Separate the local field name from foreign columns
            if '__' in field_pattern:
                field_name = field_pattern.split('__', 1)[0]
            else:
                field_name = field_pattern

            # Handle point fields
Example #30
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ENDMARKER = PythonTokenTypes.ENDMARKER
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END

TokenCollection = namedtuple(
    'TokenCollection',
    'pseudo_token single_quoted triple_quoted endpats whitespace '
    'fstring_pattern_map always_break_tokens',
)

BOM_UTF8_STRING = BOM_UTF8.decode('utf-8')

_token_collection_cache = {}

if py_version >= 30:
    # Python 3 has str.isidentifier() to check if a char is a valid identifier
    is_identifier = str.isidentifier
else:
    namechars = string.ascii_letters + '_'
    is_identifier = lambda s: s in namechars


def group(*choices, **kwargs):
    capture = kwargs.pop('capture', False)  # Python 2, arrghhhhh :(
    assert not kwargs
Example #31
from codecs import BOM_UTF8
from datetime import datetime, date
from logging import getLogger
import re

from django.contrib.gis.db import models
from django.contrib.gis.db.models.query import GeoQuerySet
from django.db.models.fields.related import ManyToManyField
from django.utils.six import StringIO, text_type, PY3

from multigtfs.compat import get_blank_value, write_text_rows

logger = getLogger(__name__)
re_point = re.compile(r'(?P<name>point)\[(?P<index>\d)\]')
batch_size = 1000
large_queryset_size = 100000
CSV_BOM = BOM_UTF8.decode('utf-8') if PY3 else BOM_UTF8


class BaseQuerySet(GeoQuerySet):
    def populated_column_map(self):
        '''Return the _column_map without unused optional fields'''
        column_map = []
        cls = self.model
        for csv_name, field_pattern in cls._column_map:
            # Separate the local field name from foreign columns
            if '__' in field_pattern:
                field_name = field_pattern.split('__', 1)[0]
            else:
                field_name = field_pattern

            # Handle point fields
Example #32
        fullname = os.path.join(dirname, filename)
        if os.path.exists(fullname):
            return fullname
    return None


# }}}

# {{{ file encoding detection
# the main idea stolen from Python 3.1's tokenize.py, by Ka-Ping Yee

import re
cookie_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)")
from codecs import lookup, BOM_UTF8
if PY3:
    BOM_UTF8 = BOM_UTF8.decode()


def detect_encoding(lines):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, lines,
    an iterable stream of lines.

    It will read a maximum of two lines, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
Example #33
# Fix XML when it is not decoded
import oerplib
import argparse
import base64
from lxml import objectify
from codecs import BOM_UTF8
BOM_UTF8U = BOM_UTF8.decode('UTF-8')

PARSER = argparse.ArgumentParser()
PARSER.add_argument("-d", "--db", help="DataBase Name", required=True)
PARSER.add_argument("-r", "--user", help="OpenERP User", required=True)
PARSER.add_argument("-w", "--passwd", help="OpenERP Password", required=True)
PARSER.add_argument("-p", "--port",
                    type=int,
                    help="Port, 8069 for default", default="8069")
PARSER.add_argument("-s", "--server",
                    help="Server IP, 127.0.0.1 for default",
                    default="127.0.0.1")
ARGS = PARSER.parse_args()

if ARGS.db is None or ARGS.user is None or ARGS.passwd is None:
    print "Must be specified DataBase, User and Password"
    quit()

DB_NAME = ARGS.db
USER = ARGS.user
PASSW = ARGS.passwd
SERVER = ARGS.server
PORT = ARGS.port

OERP_CONNECT = oerplib.OERP(SERVER,