def import_versions_from_file(csv_filename, columns):
    """
    Import the versions in the columns listed in `columns`
    :param columns: zero-based list of column numbers with a new version in them
    :return:
    """
    csv.field_size_limit(sys.maxsize)
    with open(csv_filename, 'rb') as csvfile:
        reader = csv.reader(csvfile)
        rows = [row for row in reader]
    return _import_versions_from_csv(rows, columns)
def getCSVReader(datafile, delimiter):
    '''
    @summary: Get a CSV reader that can handle encoding
    '''
    f = None
    unicodecsv.field_size_limit(sys.maxsize)
    try:
        f = open(datafile, 'rb')
        reader = unicodecsv.reader(f, delimiter=delimiter, encoding=ENCODING)
    except Exception as e:
        raise Exception('Failed to read or open {}, ({})'.format(
            datafile, str(e)))
    # assumption: hand back the reader together with the file handle so the caller can close it
    return reader, f
def load_csvgz_in_chunks(file_name, saveing_function):
    """Load data from a gzipped CSV and pass it to a save function in chunks."""
    csv.field_size_limit(sys.maxsize)
    chunk_size = 3000
    with gzip.open(file_name, "r") as f:
        stored_stream = []
        reader = csv.DictReader(f)
        for num, row in enumerate(reader):
            stored_stream.append(row)
            if (num % chunk_size) == 0 and num != 0:
                saveing_function(stored_stream)
                stored_stream = []
        # flush any rows left over from the final, partial chunk
        if stored_stream:
            saveing_function(stored_stream)
    return True
def getCSVWriter(datafile, delimiter, doAppend=True):
    '''
    @summary: Get a CSV writer that can handle encoding
    '''
    unicodecsv.field_size_limit(sys.maxsize)
    if doAppend:
        mode = 'ab'
    else:
        mode = 'wb'
    try:
        f = open(datafile, mode)
        writer = unicodecsv.writer(f, delimiter=delimiter, encoding=ENCODING)
    except Exception as e:
        raise Exception('Failed to read or open {}, ({})'.format(
            datafile, str(e)))
    # assumption: hand back the writer together with the file handle so the caller can close it
    return writer, f
def export_version_csv(index, version_list):
    assert isinstance(index, AbstractIndex)
    assert isinstance(version_list, list) or isinstance(version_list, VersionSet)
    assert all(isinstance(v, Version) for v in version_list)

    csv.field_size_limit(sys.maxsize)
    output = io.BytesIO()
    writer = csv.writer(output)

    # write header data
    writer.writerow(["Index Title"] + [index.title for _ in version_list])
    writer.writerow(["Version Title"] + [v.versionTitle for v in version_list])
    writer.writerow(["Language"] + [v.language for v in version_list])
    writer.writerow(["Version Source"] + [v.versionSource for v in version_list])
    writer.writerow(["Version Notes"] + [getattr(v, "versionNotes", "") for v in version_list])

    section_refs = index.all_section_refs()
    for section_ref in section_refs:
        segment_refs = section_ref.all_subrefs()
        seg_vers = {}

        # set blank array for version data
        for ref in segment_refs:
            seg_vers[ref.normal()] = []

        # populate each version
        for version in version_list:
            section = section_ref.text(vtitle=version.versionTitle, lang=version.language).text
            for ref in segment_refs:
                if ref.sections[-1] > len(section):
                    seg_vers[ref.normal()] += [""]
                else:
                    seg_vers[ref.normal()] += [section[ref.sections[-1] - 1]]

        # write lines for each section
        for ref in segment_refs:
            writer.writerow([ref.normal()] + seg_vers[ref.normal()])

    return output.getvalue()
def file_reader(path):
    colDict = dict()
    row = None
    index = None
    maxInt = sys.maxsize
    decrement = True
    while decrement:
        # decrease the maxInt value by factor 10
        # as long as the OverflowError occurs.
        decrement = False
        try:
            unicodecsv.field_size_limit(maxInt)
            colList = list()
            with open(path, 'r') as fr:
                data = unicodecsv.reader(fr, delimiter=C.DELIMITER, encoding=C.ENCODING, errors='replace')
                # print C.ENCODING
                for header in data.next():
                    colList.append({header: list()})
                for index, row in enumerate(data):
                    # print row
                    for i, field in enumerate(row):
                        colList[i][colList[i].keys()[0]].append(field[1:].strip())
                colDict = {k: v for d in colList for k, v in d.items()}
        except OverflowError:
            maxInt = int(maxInt / 10)
            decrement = True
        except IndexError:
            # print row
            index = index + 2
            print str(D.datetime.now()) + ">>" + path.split('\\')[-1] + ">>File is not proper>>Row number:" + str(index)
        except Exception as e:
            print str(D.datetime.now()) + ">>" + path.split('\\')[-1] + ">>" + str(e)
    return colDict
def export_merged_csv(index, lang=None):
    assert isinstance(index, Index)
    assert lang in ["en", "he"]

    csv.field_size_limit(sys.maxsize)
    output = io.BytesIO()
    writer = csv.writer(output)

    # write header data
    writer.writerow(["Index Title"] + [index.title])
    writer.writerow(["Version Title"] + ["merged"])
    writer.writerow(["Language"] + [lang])
    writer.writerow(["Version Source"] + ["-"])
    writer.writerow(["Version Notes"] + ["-"])

    section_refs = index.all_section_refs()
    for section_ref in section_refs:
        segment_refs = section_ref.all_subrefs()
        seg_vers = {}

        # set blank array for version data
        for ref in segment_refs:
            seg_vers[ref.normal()] = []

        # populate each version
        section = section_ref.text(lang=lang, exclude_copyrighted=True).text
        for ref in segment_refs:
            if ref.sections[-1] > len(section):
                seg_vers[ref.normal()] += [""]
            else:
                seg_vers[ref.normal()] += [section[ref.sections[-1] - 1]]

        # write lines for each section
        for ref in segment_refs:
            writer.writerow([ref.normal()] + seg_vers[ref.normal()])

    return output.getvalue()
import sys
from io import BytesIO

import six
import unicodecsv

from rows.plugins.utils import (
    create_table,
    get_filename_and_fobj,
    ipartition,
    serialize,
)

sniffer = unicodecsv.Sniffer()
unicodecsv.field_size_limit(sys.maxsize)


def fix_dialect(dialect):
    if not dialect.doublequote and dialect.escapechar is None:
        dialect.doublequote = True

    if dialect.quoting == unicodecsv.QUOTE_MINIMAL and dialect.quotechar == "'":
        # Python csv's Sniffer seems to detect a wrong quotechar when
        # quoting is minimal
        dialect.quotechar = '"'


if six.PY2:

    def discover_dialect(sample, encoding=None, delimiters=(b",", b";", b"\t", b"|")):
#
# python AddStudentCleaning.py Albany NY AlbanyNY_ForStudentsV1_ashley.dta 1 1930
#
# Note: Student file MUST BE ON RHEA SERVER in the "studentcleaned" directory
#
# ex: "/LatestCities/1930/studentcleaned"

import os, sys, subprocess
import unicodecsv as csv
import pandas as pd
import numpy as np
import re
import pickle
import fuzzyset

csv.field_size_limit(sys.maxsize)

# These capture information from the command prompt
c = sys.argv[1]
s = sys.argv[2]
student_file = sys.argv[3]
version = sys.argv[4]
year = sys.argv[5]

#c = "St Louis"
#s = "MO"
#student_file = "StLouisMO_ForStudentsV4_rush.dta"
#version = 6
#year = 1930

c_spaces = c
def load_authority_file(cursor, path_to_authority_files, filename, auth_file_to_entity_concept_mapping):
    print filename.upper()

    start = time()
    value_types = models.ValueTypes.objects.all()
    filepath = os.path.join(path_to_authority_files, filename)
    unicodecsv.field_size_limit(sys.maxint)
    errors = []
    lookups = Lookups()

    #create nodes for each authority document file and relate them to the authority document node in the concept schema
    auth_doc_file_name = str(filename)
    display_file_name = string.capwords(auth_doc_file_name.replace('_',' ').replace('AUTHORITY DOCUMENT.csv', '').strip())
    if auth_doc_file_name.upper() != 'ARCHES RESOURCE CROSS-REFERENCE RELATIONSHIP TYPES.E32.CSV':
        top_concept = Concept()
        top_concept.id = str(uuid.uuid4())
        top_concept.nodetype = 'Concept'
        top_concept.legacyoid = auth_doc_file_name
        top_concept.addvalue({'value':display_file_name, 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel', 'category': 'label'})
        lookups.add_relationship(source='00000000-0000-0000-0000-000000000001', type='hasTopConcept', target=top_concept.id)
    else:
        top_concept = Concept().get(id = '00000000-0000-0000-0000-000000000005')
        top_concept.legacyoid = 'ARCHES RESOURCE CROSS-REFERENCE RELATIONSHIP TYPES.E32.csv'

    lookups.add_lookup(concept=top_concept, rownum=0)

    try:
        with open(filepath, 'rU') as f:
            rows = unicodecsv.DictReader(f, fieldnames=['CONCEPTID','PREFLABEL','ALTLABELS','PARENTCONCEPTID','CONCEPTTYPE','PROVIDER'],
                encoding='utf-8-sig', delimiter=',', restkey='ADDITIONAL', restval='MISSING')
            rows.next() # skip header row
            for row in rows:
                try:
                    if 'MISSING' in row:
                        raise Exception('The row wasn\'t parsed properly. Missing %s' % (row['MISSING']))
                    else:
                        legacyoid = row[u'CONCEPTID']
                        concept = Concept()
                        concept.id = legacyoid if is_uuid(legacyoid) == True else str(uuid.uuid4())
                        concept.nodetype = 'Concept'# if row[u'CONCEPTTYPE'].upper() == 'INDEX' else 'Collection'
                        concept.legacyoid = row[u'CONCEPTID']
                        concept.addvalue({'value':row[u'PREFLABEL'], 'language': settings.LANGUAGE_CODE, 'type': 'prefLabel', 'category': 'label'})

                        if row['CONCEPTTYPE'].lower() == 'collector':
                            concept.addvalue({'value':row[u'PREFLABEL'], 'language': settings.LANGUAGE_CODE, 'type': 'collector', 'category': 'label'})

                        if row[u'ALTLABELS'] != '':
                            altlabel_list = row[u'ALTLABELS'].split(';')
                            for altlabel in altlabel_list:
                                concept.addvalue({'value':altlabel, 'language': settings.LANGUAGE_CODE, 'type': 'altLabel', 'category': 'label'})

                        parent_concept_id = lookups.get_lookup(legacyoid=row[u'PARENTCONCEPTID']).id
                        lookups.add_relationship(source=parent_concept_id, type='narrower', target=concept.id, rownum=rows.line_num)

                        # don't add a member relationship between a top concept and its children
                        if parent_concept_id != top_concept.id:
                            lookups.add_relationship(source=parent_concept_id, type='member', target=concept.id, rownum=rows.line_num)

                        # add the member relationship from the E55 type (typically) to their top members
                        if auth_doc_file_name in auth_file_to_entity_concept_mapping and row[u'PARENTCONCEPTID'] == auth_doc_file_name:
                            for entitytype_info in auth_file_to_entity_concept_mapping[auth_doc_file_name]:
                                lookups.add_relationship(source=entitytype_info['ENTITYTYPE_CONCEPTID'], type='member', target=concept.id, rownum=rows.line_num)

                        if row[u'PARENTCONCEPTID'] == '' or (row[u'CONCEPTTYPE'].upper() != 'INDEX' and row[u'CONCEPTTYPE'].upper() != 'COLLECTOR'):
                            raise Exception('The row has invalid values.')

                        lookups.add_lookup(concept=concept, rownum=rows.line_num)

                except Exception as e:
                    errors.append('ERROR in row %s: %s' % (rows.line_num, str(e)))

    except UnicodeDecodeError as e:
        errors.append('ERROR: Make sure the file is saved with UTF-8 encoding\n%s\n%s' % (str(e), traceback.format_exc()))
    except Exception as e:
        errors.append('ERROR: %s\n%s' % (str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(0, 'ERRORS IN FILE: %s\n' % (filename))
        errors.append('\n\n\n\n')

    try:
        # try and open the values file if it exists
        if exists(filepath.replace('.csv', '.values.csv')):
            with open(filepath.replace('.csv', '.values.csv'), 'rU') as f:
                rows = unicodecsv.DictReader(f, fieldnames=['CONCEPTID','VALUE','VALUETYPE','PROVIDER'],
                    encoding='utf-8-sig', delimiter=',', restkey='ADDITIONAL', restval='MISSING')
                rows.next() # skip header row
                for row in rows:
                    try:
                        if 'ADDITIONAL' in row:
                            raise Exception('The row wasn\'t parsed properly. Additional fields found %s. Add quotes to values that have commas in them.' % (row['ADDITIONAL']))
                        else:
                            row_valuetype = row[u'VALUETYPE'].strip()
                            if row_valuetype not in value_types.values_list('valuetype', flat=True):
                                valuetype = models.ValueTypes()
                                valuetype.valuetype = row_valuetype
                                valuetype.category = 'undefined'
                                valuetype.namespace = 'arches'
                                valuetype.save()
                                value_types = models.ValueTypes.objects.all()

                            concept = lookups.get_lookup(legacyoid=row[u'CONCEPTID'])
                            category = value_types.get(valuetype=row_valuetype).category
                            concept.addvalue({'value':row[u'VALUE'], 'type': row[u'VALUETYPE'], 'category': category})

                    except Exception as e:
                        errors.append('ERROR in row %s (%s): %s' % (rows.line_num, str(e), row))

    except UnicodeDecodeError as e:
        errors.append('ERROR: Make sure the file is saved with UTF-8 encoding\n%s\n%s' % (str(e), traceback.format_exc()))
    except Exception as e:
        errors.append('ERROR: %s\n%s' % (str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(0, 'ERRORS IN FILE: %s\n' % (filename.replace('.csv', '.values.csv')))
        errors.append('\n\n\n\n')

    # insert and index the concepts
    for key in lookups.lookup:
        try:
            lookups.lookup[key]['concept'].save()
        except Exception as e:
            errors.append('ERROR in row %s (%s):\n%s\n' % (lookups.lookup[key]['rownum'], str(e), traceback.format_exc()))

        lookups.lookup[key]['concept'].index(scheme=top_concept)

    # insert the concept relations
    for relation in lookups.concept_relationships:
        sql = """
            INSERT INTO concepts.relations(conceptidfrom, conceptidto, relationtype)
            VALUES ('%s', '%s', '%s');
        """ % (relation['source'], relation['target'], relation['type'])
        #print sql
        try:
            cursor.execute(sql)
        except Exception as e:
            errors.append('ERROR in row %s (%s):\n%s\n' % (relation['rownum'], str(e), traceback.format_exc()))

    if len(errors) > 0:
        errors.insert(0, 'ERRORS IN FILE: %s\n' % (filename))
        errors.append('\n\n\n\n')

    #print 'Time to parse = %s' % ("{0:.2f}".format(time() - start))

    return errors
from flask import stream_with_context
from werkzeug.exceptions import BadRequest
from itsdangerous import URLSafeSerializer

from model import Identifier, Publication

import os
import util
from collections import namedtuple, OrderedDict
import datetime

# The csv module has a 128KB per-field limit, which is too small for our purposes;
# raise it to 8MB so it does not throw errors.
import unicodecsv
unicodecsv.field_size_limit(8388608)

if 'CITACIE_DEBUG' in os.environ:
    app.debug = True

from local_settings import active_config
config = active_config(app)
serializer = URLSafeSerializer(config.secret)

import titlecase


def filter_titlecase(text, all_caps_only=False):
    if all_caps_only and not titlecase.ALL_CAPS.match(text):
        return text
    return titlecase.titlecase(text)
#!/usr/bin/python
# coding=utf-8

from __future__ import print_function
import argparse
import os
import sys
import unicodecsv as csv
import unicodedata
import codecs
import chardet
import re

csv.field_size_limit(sys.maxsize)
# from graph_tool.all import *


# Parse command-line arguments ###########################################
def options():
    """Parse command line options.

    Args:

    Returns:
        argparse object.

    Raises:
        IOError: if input file does not exist.
    """
    parser = argparse.ArgumentParser(description='Create a coauthorship network from publications downloaded '
if __name__ == '__main__':
    # maxInt = sys.maxsize
    # while True:
    #     # decrease the maxInt value by factor 10
    #     # as long as the OverflowError occurs.
    #     try:
    #         csv.field_size_limit(maxInt)
    #         break
    #     except OverflowError:
    #         maxInt = int(maxInt/10)
    import ctypes
    csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
    transformer = NLXTransformer(s3_prefix='open-skills-private/NLX_extracted', temp_file_path='/mnt/sqltransfer')
    logging.basicConfig(level=logging.INFO)
    #logging.info('max csv size set to {}'.format(maxInt))
    #for year in ('2003', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2016', '2017', '2018'):
    #for year in ('2003', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'):
    #for year in ('2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013'):
    for year in range(2003, 2020):
        year = str(year)
        #stats_counter = DatasetStatsCounter(
        #    quarter=year,
        #    dataset_id='NLX'
        #)
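# The commented-out loop in the snippet above describes the portable approach:
# keep shrinking the limit until csv.field_size_limit() accepts it, because on
# some platforms (e.g. 64-bit Windows, where a C long is 32 bits) sys.maxsize
# raises OverflowError. Below is a minimal, self-contained sketch of that
# pattern as a reusable helper; the helper name is illustrative and not part of
# the original code.
import csv
import sys


def set_max_csv_field_size(start=sys.maxsize):
    """Raise the csv field size limit as far as the platform allows and return it."""
    limit = start
    while True:
        try:
            csv.field_size_limit(limit)
            return limit
        except OverflowError:
            # The value overflowed the underlying C long; retry ten times smaller.
            limit = int(limit / 10)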
# -*- coding: utf-8 -*-

import sys
from collections import namedtuple
import unicodecsv
from decimal import Decimal
from datetime import datetime
from progressbar import ProgressBar
import progressbar.widgets
from coaster.utils import getbool
from hascore import init_for
from hascore.models import db, GeoName, GeoCountryInfo, GeoAltName, GeoAdmin1Code, GeoAdmin2Code

unicodecsv.field_size_limit(sys.maxint)

CountryInfoRecord = namedtuple('CountryInfoRecord', [
    'iso_alpha2', 'iso_alpha3', 'iso_numeric', 'fips_code', 'title', 'capital',
    'area_in_sqkm', 'population', 'continent', 'tld', 'currency_code',
    'currency_name', 'phone', 'postal_code_format', 'postal_code_regex',
    'languages', 'geonameid', 'neighbours', 'equivalent_fips_code'
    ])

GeoNameRecord = namedtuple('GeoNameRecord', [
    'geonameid', 'title', 'ascii_title', 'alternatenames', 'latitude',
    'longitude', 'fclass', 'fcode', 'country_id', 'cc2', 'admin1', 'admin2',
    'admin3', 'admin4', 'population', 'elevation', 'dem', 'timezone', 'moddate'
    ])

GeoAdminRecord = namedtuple('GeoAdminRecord', ['code', 'title', 'ascii_title', 'geonameid'])
import sys
from collections import namedtuple
from urlparse import urljoin
import zipfile
import unicodecsv
import requests
from decimal import Decimal
from datetime import datetime
from progressbar import ProgressBar
import progressbar.widgets
from coaster.utils import getbool
from hascore import init_for
from hascore.models import db, GeoName, GeoCountryInfo, GeoAltName, GeoAdmin1Code, GeoAdmin2Code

unicodecsv.field_size_limit(sys.maxint)

CountryInfoRecord = namedtuple('CountryInfoRecord', [
    'iso_alpha2', 'iso_alpha3', 'iso_numeric', 'fips_code', 'title', 'capital',
    'area_in_sqkm', 'population', 'continent', 'tld', 'currency_code',
    'currency_name', 'phone', 'postal_code_format', 'postal_code_regex',
    'languages', 'geonameid', 'neighbours', 'equivalent_fips_code'])

GeoNameRecord = namedtuple('GeoNameRecord', [
    'geonameid', 'title', 'ascii_title', 'alternatenames', 'latitude',
    'longitude', 'fclass', 'fcode', 'country_id', 'cc2', 'admin1', 'admin2',
    'admin3', 'admin4', 'population', 'elevation', 'dem', 'timezone', 'moddate'])

GeoAdminRecord = namedtuple('GeoAdminRecord', ['code', 'title', 'ascii_title', 'geonameid'])
import sys

try:
    import ujson as json
except ImportError:
    import json  # noqa: F401

try:
    import parquet
except ImportError:
    parquet = False

if sys.version_info.major == 2:
    import unicodecsv as csv
else:
    import csv

csv.field_size_limit(1000000000)
import sys

try:
    import unicodecsv as csv
except ImportError:
    import csv

import zmq
from zmq.eventloop import ioloop
from zmq.eventloop.zmqstream import ZMQStream

from cs.eyrie.config import SOCKET_TYPES
from cs.eyrie.config import ZMQChannel

csv.field_size_limit(sys.maxint)


class Vassal(object):
    channels = {
        'control': ZMQChannel(
            endpoint='ipc:///tmp/eyrie_herald',
            socket_type=zmq.SUB,
        ),
    }
    title = '(eyrie:vassal)'
    app_name = 'eyrie'
    args = None
    cursor_factory = DictCursor

    def __init__(self, **kwargs):
def import_versions_from_stream(csv_stream, columns, user_id):
    csv.field_size_limit(sys.maxsize)
    reader = csv.reader(csv_stream)
    rows = [row for row in reader]
    return _import_versions_from_csv(rows, columns, user_id)
import os
import sys

from django.core.management.base import BaseCommand

from hits.models import Hit, HitTemplate

from unicodecsv import reader as UnicodeReader
#from util.unicodecsv import UnicodeReader
#from csv import reader as UnicodeReader

# The default field size limit is 131072 characters
import unicodecsv
unicodecsv.field_size_limit(sys.maxsize)


def get_or_create_template_from_html_file(htmlfile, template_file_path):
    template_file_path = os.path.abspath(template_file_path)
    name = template_file_path
    form = htmlfile.read().decode('utf-8')
    template, created = HitTemplate.objects.get_or_create(
        name=name,
        defaults={'form': form},
    )
    if created:
        template.save()
    return template


def parse_csv_file(fh):
    rows = UnicodeReader(fh)
import unicodecsv

from calculadora_do_cidadao.rows.plugins.utils import (
    create_table,
    get_filename_and_fobj,
    ipartition,
    serialize,
)

sniffer = unicodecsv.Sniffer()
# Some CSV files have more than 128kB of data in a cell, so we force this value
# to be greater (16MB).
# TODO: check if it impacts memory usage.
# TODO: may add an option to change it by passing a parameter to import/export.
unicodecsv.field_size_limit(16777216)


def fix_dialect(dialect):
    if not dialect.doublequote and dialect.escapechar is None:
        dialect.doublequote = True

    if dialect.quoting == unicodecsv.QUOTE_MINIMAL and dialect.quotechar == "'":
        # Python csv's Sniffer seems to detect a wrong quotechar when
        # quoting is minimal
        dialect.quotechar = '"'


def discover_dialect(sample, encoding, delimiters=(",", ";", "\t", "|")):
    """Discover a CSV dialect based on a sample size.
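# A sketch only, prompted by the second TODO in the snippet above: one way to
# expose the hard-coded 16MB limit as a parameter instead of a module-level
# constant. The function and argument names are illustrative assumptions, not
# part of that module.
DEFAULT_FIELD_SIZE_LIMIT = 16777216  # 16MB, matching the value set above


def set_field_size_limit(new_limit=DEFAULT_FIELD_SIZE_LIMIT):
    """Apply a csv field size limit and return the previous one so callers can restore it."""
    return unicodecsv.field_size_limit(new_limit)


# Usage sketch: temporarily raise the limit for one import, then restore it.
# previous = set_field_size_limit(64 * 1024 * 1024)
# ... run the import ...
# unicodecsv.field_size_limit(previous)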