Exemple #1
0
 def apparent_encoding(self):
     """Return the encoding guessed for ``self.content``.

     ``chardet.detect`` is only trusted for chardet releases in the range
     ``>= 3.0.2, < 3.1.0``.  If chardet is not installed, reports a version
     outside that range, or reports a version string that cannot be parsed
     as three integers, the guess is delegated to ``check_encoding``.
     """
     try:
         import chardet
         major, minor, patch = (
             int(part) for part in chardet.__version__.split('.')[:3])
     except (ImportError, AttributeError, ValueError):
         # chardet is missing, or its version is not of the form "X.Y.Z"
         # (e.g. fewer than three parts, or a suffix such as "0rc1").
         return check_encoding(self.content)

     # Explicit comparison instead of ``assert``: assertions are removed when
     # Python runs with -O, which would silently disable the version gate.
     # chardet >= 3.0.2, < 3.1.0
     if major == 3 and minor < 1 and patch >= 2:
         return chardet.detect(self.content)['encoding']
     return check_encoding(self.content)
Exemple #2
0
def guess_field_types(request):
    """Guess the columns (and a small data sample) of an importer source.

    The JSON document in the ``fileFormat`` POST parameter selects the source
    through its ``inputFormat`` key: ``file``, ``table``, ``query``, ``rdbms``,
    ``stream`` (``kafka`` or ``flume``) or ``connector`` (``sfdc``).

    Returns:
        A ``JsonResponse`` wrapping the guessed format: a ``columns`` list
        and, when one could be obtained, a ``sample`` of rows.

    Raises:
        PopupException: if the input format, the stream selection or the
            connector selection is not recognized, or if Salesforce refuses
            the sample query.
    """
    file_format = json.loads(request.POST.get('fileFormat', '{}'))
    input_format = file_format['inputFormat']

    if input_format == 'file':
        indexer = MorphlineIndexer(request.user, request.fs)
        path = urllib_unquote(file_format["path"])
        # NOTE(review): the stream is never closed here; confirm whether the
        # filesystem's file object needs an explicit close().
        stream = request.fs.open(path)
        # Sniff the encoding from the head of the file, then rewind so the
        # indexer reads the stream from its beginning.
        encoding = check_encoding(stream.read(10000))
        stream.seek(0)
        _convert_format(file_format["format"], inverse=True)

        format_ = indexer.guess_field_types({
            "file": {
                "stream": stream,
                "name": path
            },
            "format": file_format['format']
        })

        # Note: Would also need to set charset to table (only supported in Hive)
        if 'sample' in format_ and format_['sample']:
            format_['sample'] = escape_rows(format_['sample'],
                                            nulls_only=True,
                                            encoding=encoding)
        for col in format_['columns']:
            col['name'] = smart_unicode(col['name'],
                                        errors='replace',
                                        encoding=encoding)

    elif input_format == 'table':
        sample = get_api(request, {
            'type': 'hive'
        }).get_sample_data({'type': 'hive'},
                           database=file_format['databaseName'],
                           table=file_format['tableName'])
        db = dbms.get(request.user)
        table_metadata = db.get_table(database=file_format['databaseName'],
                                      table_name=file_format['tableName'])

        format_ = {
            "sample": sample['rows'][:4],
            "columns": [
                Field(col.name,
                      HiveFormat.FIELD_TYPE_TRANSLATE.get(col.type,
                                                          'string')).to_dict()
                for col in table_metadata.cols
            ]
        }

    elif input_format == 'query':
        # 'query' is either a mapping carrying a truthy 'id' or the id itself.
        query_id = file_format['query']['id'] if file_format['query'].get(
            'id') else file_format['query']

        notebook = Notebook(document=Document2.objects.document(
            user=request.user, doc_id=query_id)).get_data()
        snippet = notebook['snippets'][0]
        db = get_api(request, snippet)

        if file_format.get('sampleCols'):
            # The caller already supplied columns and sample: reuse them.
            columns = file_format.get('sampleCols')
            sample = file_format.get('sample')
        else:
            snippet['query'] = snippet['statement']
            try:
                sample = db.fetch_result(notebook, snippet, 4,
                                         start_over=True)['rows'][:4]
            except Exception as e:
                # Best effort: still return the columns without sample rows.
                LOG.warning(
                    'Skipping sample data as query handle might be expired: %s',
                    e)
                sample = [[], [], [], [], []]
            columns = db.autocomplete(snippet=snippet, database='', table='')
            columns = [
                Field(
                    col['name'],
                    HiveFormat.FIELD_TYPE_TRANSLATE.get(col['type'],
                                                        'string')).to_dict()
                for col in columns['extended_columns']
            ]
        format_ = {
            "sample": sample,
            "columns": columns,
        }

    elif input_format == 'rdbms':
        api = _get_api(request)
        sample = api.get_sample_data(None,
                                     database=file_format['rdbmsDatabaseName'],
                                     table=file_format['tableName'])

        format_ = {
            "sample": list(sample['rows'])[:4],
            "columns": [
                Field(col['name'], col['type']).to_dict()
                for col in sample['full_headers']
            ]
        }

    elif input_format == 'stream':
        stream_selection = file_format['streamSelection']

        if stream_selection == 'kafka':
            data = get_topic_data(request.user,
                                  file_format.get('kafkaSelectedTopics'))
            kafka_field_names = [col['name'] for col in data['full_headers']]

            # Every Kafka field is exposed as a non-unique string column.
            format_ = {
                "sample": data['rows'],
                "columns": [
                    Field(name, 'string', unique=False).to_dict()
                    for name in kafka_field_names
                ]
            }
        elif stream_selection == 'flume':
            if 'hue-httpd/access_log' in file_format['channelSourcePath']:
                # Fixed schema used for Hue's own httpd access log.
                columns = [
                    {'name': 'id', 'type': 'string', 'unique': True},
                    {'name': 'client_ip', 'type': 'string'},
                    {'name': 'time', 'type': 'date'},
                    {'name': 'request', 'type': 'string'},
                    {'name': 'code', 'type': 'plong'},
                    {'name': 'bytes', 'type': 'plong'},
                    {'name': 'method', 'type': 'string'},
                    {'name': 'url', 'type': 'string'},
                    {'name': 'protocol', 'type': 'string'},
                    {'name': 'app', 'type': 'string'},
                    {'name': 'subapp', 'type': 'string'},
                ]
            else:
                columns = [{'name': 'message', 'type': 'string'}]

            format_ = {
                # Placeholder rows: no real data is read for Flume sources.
                "sample": [['...'] * len(columns)] * 4,
                "columns": [
                    Field(col['name'],
                          HiveFormat.FIELD_TYPE_TRANSLATE.get(
                              col['type'], 'string'),
                          unique=col.get('unique')).to_dict()
                    for col in columns
                ]
            }
        else:
            # Without this branch ``format_`` would be unbound at the return
            # below and the request would fail with an UnboundLocalError.
            raise PopupException(
                _('Stream format not recognized: %(streamSelection)s') %
                file_format)

    elif input_format == 'connector':
        if file_format['connectorSelection'] == 'sfdc':
            sf = Salesforce(username=file_format['streamUsername'],
                            password=file_format['streamPassword'],
                            security_token=file_format['streamToken'])
            table_metadata = [{
                'name': column['name'],
                'type': column['type']
            } for column in sf.restful('sobjects/%(streamObject)s/describe/' %
                                       file_format)['fields']]
            # NOTE(review): 'streamObject' comes straight from the request and
            # is interpolated into the query text; confirm it is validated
            # upstream.
            query = 'SELECT %s FROM %s LIMIT 4' % (', '.join(
                [col['name']
                 for col in table_metadata]), file_format['streamObject'])
            LOG.debug('Salesforce sample query: %s', query)

            try:
                records = sf.query_all(query)
            except SalesforceRefusedRequest as e:
                raise PopupException(message=str(e))

            format_ = {
                # The first value of each record is dropped from the sample.
                "sample":
                [list(row.values())[1:] for row in records['records']],
                "columns": [
                    Field(
                        col['name'],
                        HiveFormat.FIELD_TYPE_TRANSLATE.get(
                            col['type'], 'string')).to_dict()
                    for col in table_metadata
                ]
            }
        else:
            raise PopupException(
                _('Connector format not recognized: %(connectorSelection)s') %
                file_format)
    else:
        raise PopupException(
            _('Input format not recognized: %(inputFormat)s') % file_format)

    return JsonResponse(format_)
Exemple #3
0
def importer_submit(request):
    """Start the import job described by the ``source``/``destination`` POST data.

    ``source``, ``destination`` and ``start_time`` are read as JSON from the
    POST parameters.  The job to run is picked from the destination's output
    format and the source's input format; an audit record is attached to
    ``request.audit`` before returning.

    Returns:
        A ``JsonResponse`` wrapping the job handle.  Combinations for which
        no job exists (an ``altus`` source, or an ``rdbms`` source with an
        unsupported output format) produce a ``{'status': -1, 'message': ...}``
        payload instead of failing on an unassigned variable.
    """
    source = json.loads(request.POST.get('source', '{}'))
    destination = json.loads(request.POST.get('destination', '{}'))
    # The checks below key off the misspelled 'ouputFormat' entry (kept from
    # an earlier workaround), so mirror the real value under that key.
    destination['ouputFormat'] = destination['outputFormat']
    start_time = json.loads(request.POST.get('start_time', '-1'))
    show_command = request.POST.get('show_command')

    file_encoding = None
    if source['inputFormat'] == 'file' and source['path']:
        path = urllib_unquote(source['path'])
        if path.endswith(('xls', 'xlsx')):
            path = excel_to_csv_file_name_change(path)
        source['path'] = request.fs.netnormpath(path)
        # NOTE(review): the stream is never closed here; confirm whether the
        # filesystem's file object needs an explicit close().
        stream = request.fs.open(path)
        # Only the head of the file is sampled to guess its encoding.
        file_encoding = check_encoding(stream.read(10000))

    if destination['ouputFormat'] in ('database',
                                      'table') and request.fs is not None:
        if destination['nonDefaultLocation']:
            destination['nonDefaultLocation'] = request.fs.netnormpath(
                destination['nonDefaultLocation'])

    if destination['ouputFormat'] == 'index':
        source['columns'] = destination['columns']
        index_name = destination["name"]

        if destination['indexerRunJob'] or source['inputFormat'] == 'stream':
            _convert_format(source["format"], inverse=True)
            job_handle = _large_indexing(
                request,
                source,
                index_name,
                start_time=start_time,
                lib_path=destination['indexerJobLibPath'],
                destination=destination)
        else:
            client = SolrClient(request.user)
            job_handle = _small_indexing(request.user, request.fs, client,
                                         source, destination, index_name)
    elif destination['ouputFormat'] == 'stream-table':
        api = FlinkIndexer(request.user, request.fs)
        job_nb = api.create_table_from_kafka(source=source,
                                             destination=destination,
                                             start_time=start_time,
                                             dry_run=show_command)

        if show_command:
            # Dry run: hand back the generated commands without executing.
            job_handle = {'status': 0, 'commands': job_nb}
        else:
            job_handle = job_nb.execute(request, batch=False)
    elif source['inputFormat'] == 'altus':
        # BDR copy or DistCP + DDL + Sentry DDL copy: not implemented.  Report
        # that explicitly; leaving job_handle unassigned made the return
        # statement below raise UnboundLocalError.
        job_handle = {
            'status': -1,
            'message': 'Importing from an altus source is not supported'
        }
    elif source['inputFormat'] == 'rdbms':
        if destination['outputFormat'] in ('database', 'file', 'table',
                                           'hbase'):
            job_handle = run_sqoop(request, source, destination, start_time)
        else:
            # Previously fell through with job_handle unassigned.
            job_handle = {
                'status': -1,
                'message':
                'Output format not supported for an rdbms source: '
                '%(outputFormat)s' % destination
            }
    elif destination['ouputFormat'] == 'database':
        job_handle = _create_database(request, source, destination, start_time)
    elif destination['ouputFormat'] == 'big-table':
        api = PhoenixIndexer(request.user, request.fs)
        job_nb = api.create_table_from_file(request=request,
                                            source=source,
                                            destination=destination,
                                            start_time=start_time,
                                            dry_run=show_command)

        if show_command:
            # Dry run: hand back the generated commands without executing.
            job_handle = {'status': 0, 'commands': job_nb}
        else:
            job_handle = job_nb.execute(request, batch=False)
    else:
        if source['inputFormat'] == 'localfile':
            job_handle = _create_table_from_local(request, source, destination,
                                                  start_time)
        else:
            # TODO: if inputFormat is 'stream' and tableFormat is 'kudu' --> create Table only
            job_handle = _create_table(request, source, destination,
                                       start_time, file_encoding)

    request.audit = {
        'operation': 'EXPORT',
        'operationText':
        'User %(username)s exported %(inputFormat)s to %(ouputFormat)s: %(name)s'
        % {
            'username': request.user.username,
            'inputFormat': source['inputFormat'],
            'ouputFormat': destination['ouputFormat'],
            'name': destination['name'],
        },
        'allowed': True
    }

    return JsonResponse(job_handle)
Exemple #4
0
    def test_check_encoding(self):
        shift_jis = u"""
都道府県,都道府県庁所在地,人口,面積,人口密度,集積度,備考
北海道,札幌市の市旗(北海道) 札幌市,"1,961,690","1,121.26","1,750",37.5,政令指定都市
青森県,青森市の市旗(青森県) 青森市,"272,565",824.61,331,22.2,中核市
岩手県,盛岡市の市旗(岩手県) 盛岡市,"290,700",886.47,328,24.1,中核市
宮城県,仙台市の市旗(宮城県) 仙台市,"1,092,317",786.3,"1,389",47.7,政令指定都市
秋田県,秋田市の市旗(秋田県) 秋田市[14],"303,337",906.07,335,32,中核市
山形県,山形市の市旗(山形県) 山形市[14],"247,422",381.3,649,23.3,中核市
福島県,福島市の市旗(福島県) 福島市,"284,646",767.72,371,15.6,中核市
茨城県,水戸市の市旗(茨城県) 水戸市[14],"269,186",217.32,"1,239",9.4,中核市
栃木県,宇都宮市の市旗(栃木県) 宇都宮市,"518,443",416.85,"1,244",26.9,中核市
群馬県,前橋市の市旗(群馬県) 前橋市,"331,695",311.59,"1,065",17.2,中核市
埼玉県,さいたま市の市旗(埼玉県) さいたま市,"1,320,197",217.43,"6,072",18,政令指定都市
千葉県,千葉市の市旗(千葉県) 千葉市,"981,871",271.77,"3,613",15.6,政令指定都市
東京都,新宿区の区旗(東京都) 新宿区[7][13][14],"346,735",18.22,"19,030",2.5,特別区
,東京都区部(東京)[7],"9,650,247",627.57[15],"15,377",69.2,旧東京市
神奈川県,横浜市の市旗(神奈川県) 横浜市,"3,757,630",437.56,"8,588",40.8,政令指定都市
新潟県,新潟市の市旗(新潟県) 新潟市[14],"791,326",726.45,"1,089",36.1,政令指定都市
富山県,富山市の市旗(富山県) 富山市,"413,723","1,241.77",333,40.1,中核市
石川県,金沢市の市旗(石川県) 金沢市[14],"462,479",468.64,987,41,中核市
福井県,福井市の市旗(福井県) 福井市,"260,807",536.41,486,34.3,中核市
山梨県,甲府市の市旗(山梨県) 甲府市,"187,316",212.47,882,23.3,中核市
長野県,長野市の市旗(長野県) 長野市,"367,582",834.81,440,18.1,中核市
岐阜県,岐阜市の市旗(岐阜県) 岐阜市[14],"400,118",203.6,"1,965",20.3,中核市
静岡県,静岡市の市章(静岡県) 静岡市,"686,085","1,411.90",486,19,政令指定都市
愛知県,名古屋市の市旗(愛知県) 名古屋市,"2,327,723",326.45,"7,130",30.9,政令指定都市
三重県,津市の市旗(三重県) 津市,"273,267",711.19,384,15.5,?
滋賀県,大津市の市旗(滋賀県) 大津市,"342,716",464.51,738,24.3,中核市
京都府,京都市の市旗(京都府) 京都市,"1,455,377",827.83,"1,758",56.8,政令指定都市
大阪府,大阪市の市旗(大阪府) 大阪市,"2,753,476",225.21,"12,226",31.2,政令指定都市
兵庫県,神戸市の市旗(兵庫県) 神戸市,"1,514,434",557.02,"2,719",27.9,政令指定都市
奈良県,奈良市の市旗(奈良県) 奈良市[14],"352,571",276.94,"1,273",26.7,中核市
和歌山県,和歌山市の市旗(和歌山県) 和歌山市,"353,486",208.84,"1,693",38.8,中核市
鳥取県,鳥取市の市旗(鳥取県) 鳥取市,"187,442",765.31,245,34.1,中核市
島根県,松江市の市旗(島根県) 松江市,"202,008",572.99,353,30.4,中核市
岡山県,岡山市の市旗(岡山県) 岡山市,"720,456",789.95,912,38.3,政令指定都市
広島県,広島市の市旗(広島県) 広島市[14],"1,198,224",906.68,"1,322",42.9,政令指定都市
山口県,山口市の市旗(山口県) 山口市,"193,796","1,023.23",189,14.5,?
徳島県,徳島市の市旗(徳島県) 徳島市[14],"254,510",191.39,"1,330",35.3,?
香川県,高松市の市旗(香川県) 高松市,"417,814",375.41,"1,113",44,中核市
愛媛県,松山市の市旗(愛媛県) 松山市,"506,749",429.4,"1,180",38.3,中核市
高知県,高知市の市旗(高知県) 高知市,"325,807",309,"1,054",47.4,中核市
福岡県,福岡市の市旗(福岡県) 福岡市,"1,603,043",343.39,"4,668",31.4,政令指定都市
佐賀県,佐賀市の市旗(佐賀県) 佐賀市,"232,485",431.84,538,28.8,施行時特例市
長崎県,長崎市の市旗(長崎県) 長崎市,"405,090",405.86,998,31,中核市
熊本県,熊本市の市旗(熊本県) 熊本市,"738,469",390.32,"1,892",42.6,政令指定都市
大分県,大分市の市旗(大分県) 大分市,"477,354",502.38,950,42.5,中核市
宮崎県,宮崎市の市旗(宮崎県) 宮崎市,"397,560",643.67,618,37.5,中核市
鹿児島県,鹿児島市の市旗(鹿児島県) 鹿児島市[14],"593,808",547.58,"1,084",37.4,中核市
沖縄県,那覇市の市旗(沖縄県) 那覇市[14],"316,196",39.98,"7,909",21.6,中核市
    """

        gb2312 = u"""
Before,Chinese,After,Chinese,Renamed date
Weihaiwei,威海卫市,Weihai,威海市,1949-11-01
Xingshan,兴山市,Hegang,鹤岗市,1950-03-23
Xi'an,西安市,Liaoyuan,辽源市,1952-04-03
Nanzheng,南郑市,Hanzhong,汉中市,1953-10-24
Dihua,迪化市,?rümqi,乌鲁木齐市,1953-11-20
Guisui,归绥市,Hohhot,呼和浩特市,1954-04-20
Xinhailian,新海连市,Lianyungang,连云港市,1961-09-02
Andong,安东市,Dandong,丹东市,1965-01-20
Suixi,濉溪市,Huaibei,淮北市,1971-03-30
Anda,安达市,Daqing,大庆市,1979-12-14
Sucheng,宿城市,Suzhou,宿州市,1980-02-29
Lüda,旅大市,Dalian,大连市,1981-02-09
Dukou,渡口市,Panzhihua,攀枝花市,1987-01-23
Meixian,梅县市,Meizhou,梅州市,1988-01-07
Daxian,达县市,Dazhou,达州市,1993-07-05
Hunjiang,浑江市,Baishan,白山市,1994-01-31
Dayong,大庸市,Zhangjiajie,张家界市,1994-04-04
Jinxi,锦西市,Huludao,葫芦岛市,1994-09-20
Jingsha,荆沙市,Jingzhou,荆州市,1996-11-20
Puqi,蒲圻市,Chibi,赤壁市,1998-06-11
Zhicheng,枝城市,Yidu,宜都市,1998-06-11
Huaiyin,淮阴市,Huai'an,淮安市,2000-12-21
Tongza,通什市,Wuzhishan,五指山市,2001-07-05
Tiefa,铁法市,Diaobingshan,调兵山市,2002-02-20
Beining,北宁市,Beizhen,北镇市,2006-02-08
Simao,思茅市,Pu'er,普洱市,2007-01-21
Luxi,潞西市,Mangshi,芒市,2007-12-30
Xiangfan,襄樊市,Xiangyang,襄阳市,2010-11-26
  """
        big_5 = u"""
Name,Chinese,County,Founded,"Population"
Kaohsiung,高雄市,none,1979-07-01,"1,402,914"
New Taipei,新北市,none,2010-12-25,"3,974,911"
Taichung,臺中市,none,2010-12-25,"2,759,887"
Tainan,臺南市,none,2010-12-25,"1,885,499"
Taipei,臺北市,none,1967-07-01,"2,696,316"
Taoyuan,桃園市,none,2014-12-25,"2,136,702"
Chiayi,嘉義市,none,1982-07-01,"269,890"
Hsinchu,新竹市,none,1982-07-01,"436,220"
Keelung,基隆市,none,1945-10-25,"372,019"
Changhua,彰化市,Changhua,1951-12-01,"234,721"
Douliu,斗六市,Yunlin,1981-12-25,"108,098"
Hualien,花蓮市,Hualien,1946-01-16,"106,368"
Magong,馬公市,Penghu,1981-12-25,"60,335"
Miaoli,苗栗市,Miaoli,1981-12-25,"90,963"
Nantou,南投市,Nantou,1981-12-25,"102,314"
Pingtung,屏東市,Pingtung,1951-12-01,"203,866"
Puzi,朴子市,Chiayi,1992-09-10,"43,250"
Taibao,太保市,Chiayi,1991-07-01,"37,038"
Taitung,臺東市,Taitung,1976-01-01,"106,969"
Toufen,頭份市,Miaoli,2015-10-05,"102,654"
Yilan,宜蘭市,Yilan,1946-01-16,"95,879"
Yuanlin,員林市,Changhua,2015-08-08,"124,730"
Zhubei,竹北市,Hsinchu,1988-10-31,"203,195"
    """

        euc_kr = u"""
Before,Hangul,Hanja,After,Hangul,Hanja,Renamed date
Chungmu,충무시,忠武市,Tongyeong,통영시,統營市,1995-01-01
Daecheon,대천시,大川市,Boryeong,보령시,保寧市,1995-01-01
Donggwangyang,동광양시,東光陽市,Gwangyang,광양시,光陽市,1995-01-01
Geumseong,금성시,錦城市,Naju,나주시,羅州市,1986-01-01
Gyeongseong,경성부,京城府,Seoul,서울특별자유시,特別自由市,1946-08-16
Iri,이리시,裡里市,Iksan,익산시,益山市,1995-05-10
Jangseungpo,장승포시,長承浦市,Geoje,거제시,巨濟市,1995-01-01
Jeomchon,점촌시,店村市,Mungyeong,문경시,聞慶市,1995-01-01
Jeongju,정주시,井州市,Jeongeup,정읍시,井邑市,1995-01-01
Migeum,미금시,渼金市,Namyangju,남양주시,南楊州市,1995-01-01
Onyang,온양시,溫陽市,Asan,아산시,牙山市,1995-01-01
Samcheonpo,삼천포시,三千浦市,Sacheon,사천시,泗川市,1995-05-10
    """

        iso_8859 = u"""
ID,Commune,Département,Statut,Région,2017,2016,2014,2013,2006,1999,1990,1982,1975,1968
1,Maubeuge,Nord,--,Hauts-de-France,29 944,29 679,30 347,30 567,32 699,33 546,34 989,36 061,35 399,32 028
2,Aix-les-Bains,Savoie,--,Auvergne-Rhône-Alpes,29 794,29 799,30 291,29 580,27 375,25 782,24 683,23 451,22 210,20 627
3,Mont-de-Marsan,Landes,Préfecture,Nouvelle-Aquitaine,29 554,29 885,31 009,31 334,30 230,29 489,28 328,27 326,26 166,24 444
4,Clichy-sous-Bois,Seine-Saint-Denis,--,Île-de-France,29 348,29 835,29 933,30 725,29 412,28 288,28 180,24 654,22 422,16 357
5,Vienne,Isère,Sous-préfecture,Auvergne-Rhône-Alpes,29 306,29 454,29 096,29 325,30 092,29 975,29 449,28 294,27 830,29 057
6,Dieppe,Seine-Maritime,Sous-préfecture,Normandie,29 080,29 606,30 086,30 214,33 618,34 653,35 894,35 957,39 466,30 016
7,Sotteville-lès-Rouen,Seine-Maritime,--,Normandie,28 965,28 991,28 910,28 704,30 076,29 553,29 544,30 558,31 659,34 495
8,Saint-Étienne-du-Rouvray,Seine-Maritime,--,Normandie,28 641,28 696,28 752,28 738,27 815,29 092,30 731,32 444,37 242,34 713
9,Soissons,Aisne,Sous-préfecture,Hauts-de-France,28 530,28 466,28 290,28 472,28 442,29 453,29 829,30 213,30 009,25 890
10,Saint-Laurent-du-Var,Alpes-Maritimes,--,Provence-Alpes-Côte d'Azur,28 453,28 645,29 067,28 891,30 076,27 141,24 426,20 678,15 503,10 156
11,Saumur,Maine-et-Loire,Sous-préfecture,Pays de la Loire,26 734,27 125,27 301,27 413,28 654,28 935,30 131,32 149,32 515,31 629
12,Vallauris,Alpes-Maritimes,--,Provence-Alpes-Côte d'Azur,26 672,26 618,26 302,27 465,30 610,25 773,24 325,21 205,17 182,12 880
13,Vierzon,Cher,Sous-préfecture,Centre-Val de Loire,25 903,26 365,27 050,27 113,28 147,29 719,32 235,34 209,35 699,33 775
14,Alençon,Orne,Préfecture,Normandie,25 848,26 129,26 028,26 350,28 458,28 935,29 988,31 608,33 680,31 656
15,Le Grand-Quevilly,Seine-Maritime,--,Normandie,25 698,25 897,25 273,24 967,26 226,26 679,27 658,31 650,31 963,25 611
16,Aurillac,Cantal,Préfecture,Auvergne-Rhône-Alpes,25 499,25 954,26 135,26 572,29 477,30 551,30 773,30 963,30 863,28 226
17,Biarritz,Pyrénées-Atlantiques,--,Nouvelle-Aquitaine,25 404,24 777,24 713,24 993,26 690,30 055,28 742,26 598,27 595,26 750
18,Montbéliard,Doubs,Sous-préfecture,Bourgogne-Franche-Comté,25 395,25 304,25 521,25 697,26 535,27 570,29 005,31 836,30 425,23 908
19,Vichy,Allier,Sous-préfecture,Auvergne-Rhône-Alpes,24 166,24 383,26 279,25 325,26 108,26 528,27 714,30 527,32 117,33 506
20,Saint-Dizier,Haute-Marne,Sous-préfecture,Grand Est,24 012,24 932,25 505,25 626,26 972,30 900,33 552,35 189,37 266,36 616
21,Orly,Val-de-Marne,--,Île-de-France,23 801,23 378,22 603,22 377,21 197,20 470,21 646,23 766,26 104,30 197
22,Bruay-la-Buissière,Pas-de-Calais,--,Hauts-de-France,21 831,22 230,22 579,22 802,23 813,23 998,24 927,26 649,29 435,32 341
23,Le Creusot,Saône-et-Loire,--,Bourgogne-Franche-Comté,21 630,21 752,21 991,22 308,23 813,26 283,28 909,32 149,33 366,34 102
    """

        cp1252 = u"""
Rank,City/town,Russian,Federal subject,Federal district,Population,Change
1,Moscow,Москва,Moscow (federal city)[3],Central,"12,480,481",8.49%
2,Saint Petersburg,Санкт-Петербург,Saint Petersburg (federal city)[4],Northwest,"5,398,064",10.63%
3,Novosibirsk,Новосибирск,Novosibirsk Oblast,Siberia,"1,625,631",10.31%
4,Yekaterinburg,Екатеринбург,Sverdlovsk Oblast,Ural,"1,493,749",10.67%
5,Kazan,Казань,Republic of Tatarstan,Volga,"1,257,391",9.96%
6,Nizhny Novgorod,Нижний Новгород,Nizhny Novgorod Oblast,Volga,"1,252,236",0.13%
7,Chelyabinsk,Челябинск,Chelyabinsk Oblast,Ural,"1,196,680",5.89%
8,Samara,Самара,Samara Oblast,Volga,"1,156,659",-0.69%
9,Omsk,Омск,Omsk Oblast,Siberia,"1,154,507",0.03%
10,Rostov-on-Don,Ростов-на-Дону,Rostov Oblast,South[5],"1,137,904",4.47%
11,Ufa,Уфа,Republic of Bashkortostan,Volga,"1,128,787",6.26%
12,Krasnoyarsk,Красноярск,Krasnoyarsk Krai,Siberia,"1,093,771",12.32%
13,Voronezh,Воронеж,Voronezh Oblast,Central,"1,058,261",18.95%
14,Perm,Пермь,Perm Krai,Volga,"1,055,397",6.48%
15,Volgograd,Волгоград,Volgograd Oblast,South,"1,008,998",-1.20%
16,Krasnodar,Краснодар,Krasnodar Krai,South,"932,629",25.19%
17,Saratov,Саратов,Saratov Oblast,Volga,"838,042",0.02%
18,Tyumen,Тюмень,Tyumen Oblast,Ural,"807,271",38.73%
19,Tolyatti,Тольятти,Samara Oblast,Volga,"699,429",-2.81%
20,Izhevsk,Ижевск,Udmurt Republic,Volga,"648,146",3.25%
21,Barnaul,Барнаул,Altai Krai,Siberia,"632,391",3.26%
22,Ulyanovsk,Ульяновск,Ulyanovsk Oblast,Volga,"627,705",2.10%
23,Irkutsk,Иркутск,Irkutsk Oblast,Siberia,"623,562",6.07%
24,Khabarovsk,Хабаровск,Khabarovsk Krai,Far East,"616,372",6.74%
25,Yaroslavl,Ярославль,Yaroslavl Oblast,Central,"608,353",2.85%
26,Vladivostok,Владивосток,Primorsky Krai,Far East,"606,561",2.45%
27,Makhachkala,Махачкала,Republic of Dagestan,North Caucasus,"603,518",5.50%
28,Tomsk,Томск,Tomsk Oblast,Siberia,"576,624",9.90%
29,Orenburg,Оренбург,Orenburg Oblast,Volga,"572,188",4.35%
30,Kemerovo,Кемерово,Kemerovo Oblast,Siberia,"556,382",4.39%
31,Novokuznetsk,Новокузнецк,Kemerovo Oblast,Siberia,"549,403",0.27%
32,Ryazan,Рязань,Ryazan Oblast,Central,"539,290",2.74%
33,Naberezhnye Chelny,Набережные ЧелныRepublic of Tatarstan,Volga,"533,839",4.02%
34,Astrakhan,Астрахань,Astrakhan Oblast,South,"529,793",1.82%
35,Penza,Пенза,Penza Oblast,Volga,"520,300",0.58%
36,Kirov,Киров,Kirov Oblast,Volga,"518,348",9.43%
37,Lipetsk,Липецк,Lipetsk Oblast,Central,"508,573",-0.06%
38,Balashikha,Балашиха,Moscow Oblast,Central,"507,366",135.44%
39,Cheboksary,Чебоксары,Chuvash Republic,Volga,"497,618",9.67%
40,Kaliningrad,Калининград,Kaliningrad Oblast,Northwest,"489,359",13.30%
41,Tula,Тула,Tula Oblast,Central,"475,161",-5.19%
42,Kursk,Курск,Kursk Oblast,Central,"452,976",9.11%
43,Stavropol,Ставрополь,Stavropol Krai,North Caucasus,"450,680",13.08%
44,Sochi,Сочи,Krasnodar Krai,South,"443,562",29.19%
45,Ulan-Ude,Улан-Удэ,Republic of Buryatia,Far East,"439,128",8.58%
46,Tver,Тверь,Tver Oblast,Central,"425,072",5.32%
47,Magnitogorsk,Магнитогорск,Chelyabinsk Oblast,Ural,"413,253",1.34%
48,Ivanovo,Иваново,Ivanovo Oblast,Central,"404,598",-0.91%
    """

        test_dict = {
            'shift-jis': shift_jis.encode('shift-jis'),
            'gb2312': gb2312.encode('gb2312'),
            'big5': big_5.encode('big5'),
            'EUC-KR': euc_kr.encode('EUC-KR'),
            'iso-8859-1': iso_8859.encode('iso-8859-1'),
            ### 'cp1252': cp1252.encode('cp1252')
        }

        for key in test_dict:
            enc_code = check_encoding(test_dict[key])
            assert_equal(
                key, enc_code,
                "compare target encoding %s with tested encoding %s" %
                (key, enc_code))