    def sniff(self,
              file_upload: FileUpload,
              encoding: str = settings.DEFAULT_CHARSET,
              limit: int = 5) -> SniffResult:

        try:
            with file_upload.open() as csv_file:
                has_header = unicodecsv.Sniffer().has_header(
                    csv_file.read(1024).decode(encoding))
                csv_file.seek(0)
                dialect = unicodecsv.Sniffer().sniff(
                    csv_file.read(1024).decode(encoding))
                csv_format_opts = dict(dialect=dialect)
                csv_file.seek(0)

                reader = unicodecsv.reader(csv_file, **csv_format_opts)
                if has_header:
                    header = next(reader)
                else:
                    header = None

                rows = list(islice(reader, max(0, limit))) if limit > 0 else []
        except (UnicodeDecodeError, unicodecsv.Error) as e:
            raise ParsingException(str(e)) from e

        contact_serializer = self.get_contact_serializer(data={})
        fields = {
            name: field
            for name, field in contact_serializer.get_fields().items()
            if not field.read_only
        }

        headers_mapping = {}
        if header:
            for num, name in enumerate(header):
                field_names = difflib.get_close_matches(name,
                                                        fields.keys(),
                                                        n=1)
                if field_names:
                    fields_name = field_names[0]
                    headers_mapping[fields_name] = num

        return SniffResult(
            dict(
                has_header=has_header,
                delimiter=dialect.delimiter,
            ),
            list(fields.keys()),
            rows,
            headers_mapping,
        )
Example No. 2
def load_from_csv(filename):
    waypoints = []
    _dirname, _name = os.path.split(filename)
    _fs = open_fs(_dirname)
    with _fs.open(_name, "rb") as in_file:
        lines = in_file.readlines()
    if len(lines) < 4:
        raise SyntaxError("CSV file requires at least 4 lines!")
    dialect = csv.Sniffer().sniff(lines[-1].decode("utf-8"))
    csv_reader = csv.reader(lines, encoding="utf-8", dialect=dialect)
    name = next(csv_reader)[0]
    next(csv_reader)  # header
    for row in csv_reader:
        wp = ft.Waypoint()
        wp.location = row[1]
        wp.lat = float(row[2])
        wp.lon = float(row[3])
        wp.flightlevel = float(row[4])
        wp.pressure = float(row[5]) * 100.
        wp.distance_to_prev = float(row[6])
        wp.distance_total = float(row[7])
        wp.comments = row[8]
        waypoints.append(wp)
    name = os.path.basename(filename.replace(".csv", "").strip())
    return name, waypoints
Example No. 3
def validate_delimiter(delimiter, codebook):
    """
    Validate that the selected delimiter matches the sniffed delimiter

    :delimiter: delimiter as selected from the UI
    :codebook: open file object for reading

    :return: boolean of delimiter mismatch and errors list
    """
    errors = []
    delimiter_mismatch = False
    codebook.readline()
    dialect = csv.Sniffer().sniff(codebook.readline())
    sniffed_delimiter = dialect.delimiter

    if delimiter != sniffed_delimiter:
        error = {
            'errors': u"Selected delimiter doesn't match file delimiter",
            'schema_name': 'N/A',
            'schema_title': 'N/A',
            'name': 'N/A',
            'title': 'N/A'
        }

        delimiter_mismatch = True
        errors.append(error)

    codebook.seek(0)

    return delimiter_mismatch, errors
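A hedged usage sketch for the validate_delimiter() helper above; it assumes csv is imported at module level and simulates the open codebook file with io.StringIO:

import io

codebook = io.StringIO("id;name;age\n1;Ada;36\n2;Grace;45\n")
mismatch, errors = validate_delimiter(",", codebook)
# The sniffer should detect ';' from the second line, so mismatch is True and
# errors carries the "Selected delimiter doesn't match file delimiter" entry.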
Example No. 4
def grab_that(station):
    """
    A method that extracts climate data from CSV and converts it to a
    dictionary object.
    """
    with codecs.open(
            station,
            'rb',
    ) as f:
        # Tries to figure out CSV formatting to address encoding issues.
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        lines = csv.reader(f, dialect)
        for i in range(16):  # Skips the metadata
            next(lines)

        names, datum = [], {}
        for column in lines:
            for name in column:
                names.append(name)
                datum[name] = []
            break

        reader = csv.DictReader(f,
                                fieldnames=names,
                                delimiter=',',
                                quotechar='"')
        for row in reader:
            for column, value in row.iteritems():
                value = convert(value)
                datum.setdefault(column, []).append(value)
    return datum
Example No. 5
def load_dataset_csv(filename):
    info = open(filename, "rb")
    has_header = unicodecsv.Sniffer().has_header(info.read(1024))
    info.seek(0)
    incsv = csv.reader(info)
    if has_header:
        next(incsv)  #Skip header
    dataset = list(incsv)
    return dataset
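load_dataset_csv() mixes unicodecsv (for the header check) with the stdlib reader on a binary handle, which only behaves as intended on Python 2. A rough Python 3 equivalent using just the standard library:

import csv

def load_dataset_csv_py3(filename, encoding="utf-8"):
    with open(filename, newline="", encoding=encoding) as f:
        has_header = csv.Sniffer().has_header(f.read(1024))
        f.seek(0)
        reader = csv.reader(f)
        if has_header:
            next(reader)  # skip the header row
        return list(reader)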
Example No. 6
    def next_source_row(self, handle):
        """
        Given a file handle, return the next row of data as a key value dict.

        Return None to denote the EOF
        Return False to skip this row of data entirely
        """

        if not getattr(self, "detected_dialect", None):
            # Sniff for the dialect of the CSV file

            pos = handle.tell()
            handle.seek(0)
            readahead = handle.read(1024)
            handle.seek(pos)

            try:
                dialect = csv.Sniffer().sniff(readahead, ",")
            except csv.Error:
                # Fall back to the excel dialect if sniffing fails
                dialect = csv.excel

            dialect_attrs = [
                "delimiter",
                "doublequote",
                "escapechar",
                "lineterminator",
                "quotechar",
                "quoting",
                "skipinitialspace"
            ]

            self.detected_dialect = {x: getattr(dialect, x) for x in dialect_attrs}

        if not getattr(self, "reader", None):
            self.reader = csv.reader(handle, **self.detected_dialect)

        if not getattr(self, "detected_columns", None):
            # On first iteration, the line will be the column headings,
            # store those and return False to skip processing
            columns = self.reader.next()
            self.detected_columns = columns
            return False

        cols = self.detected_columns

        try:
            values = self.reader.next()
        except StopIteration:
            return None

        if not values:
            return None

        return dict(zip(cols, values))
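next_source_row() caches the sniffed dialect as a plain dict of format parameters and later rebuilds the reader from it. csv.reader accepts exactly those attributes as keyword arguments, so a cached dict can be replayed against a fresh handle, as in this small sketch (data.csv and the parameter values are assumptions):

import csv

detected_dialect = {
    "delimiter": ",",
    "doublequote": True,
    "escapechar": None,
    "lineterminator": "\r\n",
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "skipinitialspace": False,
}

with open("data.csv", newline="") as handle:
    reader = csv.reader(handle, **detected_dialect)
    columns = next(reader)  # first row holds the column headings
    records = [dict(zip(columns, row)) for row in reader if row]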
Example No. 7
def main():
    dialect = csv.Sniffer().sniff(EJEMPLO)
    reader = csv.reader(open(sys.argv[1]), dialect=dialect)
    writer = csv.DictWriter(open('productos.csv', 'w'),
                            fieldnames=PRODUCTO_COLS)
    writer.writeheader()
    bar = Bar('Normalizando CSV', suffix='%(percent)d%%')
    for l in bar.iter(reader):
        data = normalizar(dict(zip(headers, l)))
        writer.writerow(data)
Example No. 8
def csv_data(csv_path, skip_header=True):
    """Pass in the path to a CSV file, returns a CSV Reader object.
    """
    csv_file = open(csv_path, 'r')
    # Determine the CSV dialect.
    dialect = unicodecsv.Sniffer().sniff(csv_file.read(1024))
    csv_file.seek(0)
    data = unicodecsv.reader(csv_file, dialect)
    if skip_header:
        data.next()
    return data
Example No. 9
    def fectxt2pivot(self, fileobj):
        fieldnames = [
            'journal',        # JournalCode
            False,            # JournalLib
            False,            # EcritureNum
            'date',           # EcritureDate
            'account',        # CompteNum
            False,            # CompteLib
            'partner_ref',    # CompAuxNum
            False,            # CompAuxLib
            'ref',            # PieceRef
            False,            # PieceDate
            'name',           # EcritureLib
            'debit',          # Debit
            'credit',         # Credit
            'reconcile_ref',  # EcritureLet
            False,            # DateLet
            False,            # ValidDate
            False,            # Montantdevise
            False,            # Idevise
            ]
        first_line = fileobj.readline().decode()
        dialect = unicodecsv.Sniffer().sniff(first_line, delimiters="|\t")
        fileobj.seek(0)
        reader = unicodecsv.DictReader(
            fileobj,
            fieldnames=fieldnames,
            delimiter=dialect.delimiter,
            encoding=self.file_encoding)
        res = []
        i = 0
        for l in reader:
            i += 1
            # Skip header line
            if i == 1:
                continue
            l['credit'] = l['credit'] or '0'
            l['debit'] = l['debit'] or '0'
            vals = {
                'journal': l['journal'],
                'account': l['account'],
                'partner': l['partner_ref'],
                'credit': float(l['credit'].replace(',', '.')),
                'debit': float(l['debit'].replace(',', '.')),
                'date': datetime.strptime(l['date'], '%Y%m%d'),
                'name': l['name'],
                'ref': l['ref'],
                'reconcile_ref': l['reconcile_ref'],
                'line': i,
            }
            res.append(vals)
        return res
Example No. 10
def _csv_data_from_file(csv_file, preview_limit=10):

    try:
        dialect = unicodecsv.Sniffer().sniff(csv_file.read(1024))
        csv_file.seek(0)
        csv_reader = unicodecsv.reader(csv_file, dialect)
        csv_values = itertools.islice(csv_reader, preview_limit)
        csv_values = zip(*csv_values)
        return {'success': True, 'data': csv_values}
    except unicodecsv.Error as exc:
        return {'success': False, 'error': exc.message}
    except UnicodeDecodeError as exc:
        return {'success': False, 'error': exc}
Example No. 11
    def genericcsv2pivot(self, fileobj):
        # Prisme
        fieldnames = [
            'date', 'journal', 'account', 'partner',
            'analytic', 'name', 'debit', 'credit',
            'ref', 'reconcile_ref'
            ]
        first_line = fileobj.readline().decode()
        dialect = unicodecsv.Sniffer().sniff(first_line)
        fileobj.seek(0)
        reader = unicodecsv.DictReader(
            fileobj,
            fieldnames=fieldnames,
            delimiter=dialect.delimiter,
            quotechar='"',
            quoting=unicodecsv.QUOTE_MINIMAL,
            encoding='utf-8')
        res = []
        i = 0
        for l in reader:
            i += 1
            if i == 1 and self.file_with_header:
                continue
            date_str = l['date']
            try:
                date = datetime.strptime(date_str, self.date_format)
            except Exception:
                raise UserError(
                    (_("time data : '%s' in line %s does not match format '%s")
                     ) % (date_str, i, self.date_format))

            vals = {
                'journal': l['journal'],
                'account': l['account'],
                'credit': float(l['credit'].replace(',', '.') or 0),
                'debit': float(l['debit'].replace(',', '.') or 0),
                'date': date,
                'name': l['name'],
                'ref': l.get('ref', ''),
                'reconcile_ref': l.get('reconcile_ref', ''),
                'line': i,
                }
            if l['analytic']:
                vals['analytic'] = l['analytic']
            if l['partner']:
                vals['partner'] = l['partner']
            res.append(vals)
        return res
Example No. 12
def commit_to_csv(commit, csv_filename):
    """
        Get a CSV generator from a git commit.
    """
    repo.git_dir
    data = (commit.tree / csv_filename).data_stream.read()
    dialect = csv.Sniffer().sniff(StringIO(unicode(data)).read(1024))
    data = data.splitlines()
    for n, row in enumerate(data):
        if n == 0:
            data[n] = "ID" + dialect.delimiter + row
        else:
            data[n] = str(n) + dialect.delimiter + row
    data = "\n".join(data)
    csv_out = csv.DictReader(StringIO(unicode(data), newline=None),
                             dialect=dialect)
    return csv_out
Example No. 13
def validate_delimiter(delimiter, codebook):
    """
    Validate the selected delimiter matches the sniffed delimeter

    :delimiter: delimiter as selected from the UI
    :codebook: open file object for reading

    :return: boolean of delimiter mismatch and errors list
    """
    errors = []
    delimiter_mismatch = False
    codebook.readline()

    row = codebook.readline()
    try:
        dialect = csv.Sniffer().sniff(row)
    except Exception as e:
        error = {
            'errors': '{} - Row: {}'.format(e, row),
            'schema_name': 'N/A',
            'schema_title': 'N/A',
            'schema_publish_date': 'N/A',
            'name': 'N/A',
            'title': 'N/A'
        }
        errors.append(error)
    else:
        sniffed_delimiter = dialect.delimiter

        if delimiter != sniffed_delimiter:
            error = {
                'errors': u"Selected delimiter doesn't match file delimiter",
                'schema_name': 'N/A',
                'schema_title': 'N/A',
                'schema_publish_date': 'N/A',
                'name': 'N/A',
                'title': 'N/A'
            }

            delimiter_mismatch = True
            errors.append(error)

        codebook.seek(0)

    return delimiter_mismatch, errors
Example No. 14
def place_that(name):
    """
    When given a filename will dump station location headers
    to console and return a dictionary with raw unicode keys
    and values for station name and location variables.
    """
    try:
        location = str(name)
        with codecs.open(location, 'rb') as f:
            dialect = csv.Sniffer().sniff(f.read(1024))
            f.seek(0)
            verifier = csv.reader(f, dialect)
            for count, row in enumerate(verifier):  # Read and format metadata
                if count > 6:
                    break
            f.seek(0)
            names = ('Station Name', 'Province', 'Latitude', 'Longitude',
                     'Elevation', 'Climate Identifier', 'WMO Identifier',
                     'TC Identifier')
            datum = {}
            for name in names:
                datum[name] = []

            for count, row in enumerate(verifier):
                if count == 0:  # Special handling to deal with UTF-8 BOM
                    key = 'Station Name'
                    field = convert(row[1])
                    datum[key] = field
                    continue
                try:
                    if row[0] in names:
                        key = convert(row[0])
                        field = convert(row[1])
                        datum[key] = field
                except Exception as e:
                    print(e)
                    continue

        return datum
    except ValueError:
        raise Exception("Invalid station CSV. Verify that CSVs hold "
                        "Environment Canada station data.")
Example No. 15
 def detectDialect(self, filename, comment="#"):
     """
     detectDialect
     """
     dialect = None
     with open(filename, "rb") as stream:
         stream = self.skip_commented_or_empty_lines(stream, comment)
         n = 128
         detected = False
         while not detected:
             try:
                 dialect = csv.Sniffer().sniff(stream.read(n),
                                               delimiters=";,")
                 detected = True
             except Exception as ex:
                 #print ex,"n=",n
                 n = n * 2
         stream.seek(0)
     return dialect
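detectDialect() above retries the sniff with a doubling sample size, but each stream.read(n) continues from wherever the previous attempt stopped, so later attempts sample a different part of the file and the loop never terminates if sniffing keeps failing. A hedged variant (ignoring the comment-skipping helper) that rewinds before every retry and bounds the sample size:

import csv

def detect_dialect(path, delimiters=";,", max_sample=64 * 1024):
    """Sniff a dialect from progressively larger samples; return None if undecidable."""
    with open(path, newline="") as stream:
        n = 128
        while n <= max_sample:
            stream.seek(0)
            try:
                return csv.Sniffer().sniff(stream.read(n), delimiters=delimiters)
            except csv.Error:
                n *= 2  # enlarge the sample and retry from the top of the file
    return None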
Example No. 16
def load_csvf(fpath, fieldnames, encoding):
    """
    :param unicode fpath:
    :param Optional[list[unicode]] fieldnames:
    :param unicode encoding:
    :rtype: List[dict]
    """
    with open(fpath, 'rb') as f:
        snippet = f.read(8192)
        f.seek(0)

        dialect = csv.Sniffer().sniff(
            snippet if PYTHON2 else snippet.decode(encoding))
        dialect.skipinitialspace = True
        return list(
            csv.DictReader(f,
                           fieldnames=fieldnames,
                           dialect=dialect,
                           encoding=encoding))
Example No. 17
def writeToFile(row_list):
    if Path('result.csv').is_file():
        f = open("result.csv", "a")
        sniffer = csv.Sniffer()
        csv_dialect = sniffer.sniff(open("result.csv").readline())

        writer = csv.writer(f,
                            encoding='UTF-8',
                            quoting=csv.QUOTE_NONE,
                            escapechar='\\',
                            dialect=csv_dialect)
        writer.writerows(row_list)
        f.close()
    else:
        with open('result.csv', 'w') as file:
            writer = csv.writer(file,
                                encoding='UTF-8',
                                delimiter=',',
                                quoting=csv.QUOTE_NONE,
                                escapechar='\\')
            writer.writerows(row_list)
Example No. 18
def writeToFile(row_list):
    file_name = 'percents.csv'
    if Path(file_name).is_file():
        f = open(file_name, "a")
        sniffer = csv.Sniffer()
        csv_dialect = sniffer.sniff(open(file_name).readline())

        writer = csv.writer(f,
                            encoding='UTF-8',
                            quoting=csv.QUOTE_NONE,
                            escapechar='|',
                            dialect=csv_dialect)
        writer.writerows(row_list)
        f.close()
    else:
        with open(file_name, 'w') as file:
            writer = csv.writer(file,
                                encoding='UTF-8',
                                delimiter=',',
                                quoting=csv.QUOTE_NONE,
                                escapechar='|')
            writer.writerows(row_list)
Example No. 19
   def records(self, in_file):
      import unicodecsv as csv
      with open(in_file, 'rb') as csvfile:
         dialect = csv.Sniffer().sniff(csvfile.read(1024))
         csvfile.seek(0)

         reader = None
         if self.columns:
            reader = csv.DictReader(csvfile, fieldnames=self.columns, dialect=dialect)
         elif self.read_header:
            reader = csv.DictReader(csvfile, dialect=dialect)
         else:
            reader = csv.reader(csvfile, dialect=dialect)
            
         def convert(obj):
            if isinstance(obj, dict):
               return obj
            return dict( ("_c%s" % idx, v) for idx,v in enumerate(obj))

         for row in reader:
            if len(row) > 0:
               yield convert(row)
Example No. 20
    def to_internal_value(self, data):
        super(JsonFileField, self).to_internal_value(data)

        if is_zipfile(data):
            with ZipFile(data) as zf:
                raw_data = zf.read(splitext(data.name)[0])
        else:
            data.seek(0)
            raw_data = data.read()

        try:
            data.json = json.loads(raw_data)
        except ValueError:
            try:
                data.json = json.loads(raw_data, encoding='cp1251')
            except ValueError:
                try:
                    lines = raw_data.splitlines()
                    dialect = csv.Sniffer().sniff(lines[0], [',', ';', '\t'])
                    data.json = [item for item in csv.DictReader(lines, dialect=dialect)]
                except (ValueError, csv.Error):
                    self.fail('json')
        return data
Example No. 21
def get_csv(infile):

    sniff_range = 4096

    sniffer = csv.Sniffer()

    dialect = sniffer.sniff(infile.read(sniff_range), delimiters=DELIMITERS)

    infile.seek(0)

    # Sniff for header
    header = sniffer.has_header(infile.read(sniff_range))

    infile.seek(0)

    # get the csv reader
    reader = csv.reader(infile, dialect)

    firstrow = next(reader)

    colnames = []

    for i, h in enumerate(firstrow):

        if len(h) > 0 and header:

            colnames.append(h)

        else:

            colnames.append('COLUMN{}'.format(i + 1))

    if not header:

        infile.seek(0)

    return (reader, colnames)
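get_csv() expects an already-open text file, a module-level DELIMITERS string of candidate delimiters, and csv imported at module level; a brief usage sketch under those assumptions, feeding it an in-memory file:

import io

DELIMITERS = ",;\t|"  # assumed candidate delimiters for the sniffer

sample = io.StringIO("name;age\nada;36\ngrace;45\n")
reader, colnames = get_csv(sample)
# With a header detected, colnames should be ['name', 'age'] and the
# reader then yields only the remaining data rows.
for row in reader:
    record = dict(zip(colnames, row))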
Example No. 22
def upload_randomization_json(context, request):
    """
    Handles RANDID file uploads.
    The file is expected to be a CSV with the following columns:
        * ARM
        * STRATA
        * BLOCKID
        * RANDID
    In addition, the CSV file must have the columns as the form
    it is using for randomization.
    """

    check_csrf_token(request)
    db_session = request.db_session

    if not context.is_randomized:
        # No form check required as its checked via database constraint
        raise HTTPBadRequest(body=_(u'This study is not randomized'))

    input_file = request.POST['upload'].file
    input_file.seek(0)

    # Ensure we can read the CSV
    try:
        csv.Sniffer().sniff(input_file.read(1024))
    except csv.Error:
        raise HTTPBadRequest(body=_(u'Invalid file-type, must be CSV'))
    else:
        input_file.seek(0)

    reader = csv.DictReader(input_file)

    # Case-insensitive lookup
    fieldnames = dict((name.upper(), name) for name in reader.fieldnames)
    stratumkeys = ['ARM', 'BLOCKID', 'RANDID']
    formkeys = context.randomization_schema.attributes.keys()

    # Ensure the CSV defines all required columns
    required = stratumkeys + formkeys
    missing = [name for name in required if name.upper() not in fieldnames]
    if missing:
        raise HTTPBadRequest(
            body=_(u'File upload is missing the following columns ${columns}',
                   mapping={'columns': ', '.join(missing)}))

    # We'll be using this to create new arms as needed
    arms = dict([(arm.name, arm) for arm in context.arms])

    # Default to the complete state since they're generated by a statistician
    complete = (db_session.query(
        datastore.State).filter_by(name=u'complete').one())

    for row in reader:
        arm_name = row[fieldnames['ARM']]
        if arm_name not in arms:
            arms[arm_name] = models.Arm(study=context,
                                        name=arm_name,
                                        title=arm_name)

        stratum = models.Stratum(study=context,
                                 arm=arms[arm_name],
                                 block_number=int(row[fieldnames['BLOCKID']]),
                                 randid=row[fieldnames['RANDID']])

        if 'STRATA' in fieldnames:
            stratum.label = row[fieldnames['STRATA']]

        db_session.add(stratum)

        entity = datastore.Entity(schema=context.randomization_schema,
                                  state=complete)

        for key in formkeys:
            entity[key] = row[fieldnames[key.upper()]]

        stratum.entities.add(entity)

    try:
        db_session.flush()
    except sa.exc.IntegrityError as e:
        if 'uq_stratum_reference_number' in e.message:
            raise HTTPBadRequest(body=_(
                u'The submitted file contains existing reference numbers. '
                u'Please upload a file with new reference numbers.'))

    return HTTPOk()
Example No. 23
from __future__ import unicode_literals

import sys
from io import BytesIO

import six
import unicodecsv

from rows.plugins.utils import (
    create_table,
    get_filename_and_fobj,
    ipartition,
    serialize,
)

sniffer = unicodecsv.Sniffer()
unicodecsv.field_size_limit(sys.maxsize)


def fix_dialect(dialect):
    if not dialect.doublequote and dialect.escapechar is None:
        dialect.doublequote = True

    if dialect.quoting == unicodecsv.QUOTE_MINIMAL and dialect.quotechar == "'":
        # Python csv's Sniffer seems to detect a wrong quotechar when
        # quoting is minimal
        dialect.quotechar = '"'


if six.PY2:
Example No. 24
    def process(self, data, url_object):
        """Process the CSV, by executing rules and saving matches."""
        from ..scanner_types.scanner import Scanner
        scanner = Scanner.from_scan_id(url_object.scan.pk)
        # print "*** 1 ***"
        # If we don't have to do any annotation/replacement, treat it like a
        # normal text file for efficiency
        if not scanner.scan_object.output_spreadsheet_file:
            return self.text_processor.process(data, url_object)

        # Check if scan is limited to certain columns.
        columns = scanner.scan_object.columns
        columns = list(map(int, columns.split(','))) if columns else []

        # Try to detect the CSV Dialect using the first 1024 characters of
        # the data
        try:
            dialect = unicodecsv.Sniffer().sniff(data[:1024])
        except unicodecsv.Error:
            # Couldn't detect CSV Dialect, processing failed
            scanner.scan_object.log_occurrence("Could not detect CSV "
                                               "dialect. Could not perform "
                                               "annotation/replacement.")
            return False

        # Sniffer.sniff doesn't set escape character or quoting
        dialect.escapechar = '\\'
        dialect.quoting = unicodecsv.QUOTE_ALL

        # print "*** 2 ***"
        # Convert unicode dialect properties to str because csv.Reader
        # expects them to be
        dialect.delimiter = str(dialect.delimiter)
        dialect.quotechar = str(dialect.quotechar)
        dialect.doublequote = str(dialect.doublequote)
        dialect.escapechar = str(dialect.escapechar)
        dialect.lineterminator = str(dialect.lineterminator)
        dialect.skipinitialspace = str(dialect.skipinitialspace)

        rows = []

        # print "*** 3 ***"
        # Read CSV file
        reader = unicodecsv.reader(io.StringIO(data.encode('utf-8')), dialect)
        first_row = True
        header_row = []
        for row in reader:
            warnings_in_row = []
            if first_row:
                header_row = row
                # Append column header
                row.append("Matches")
                first_row = False
                rows.append(row)
                continue

            for i in range(len(row)):
                # If columns are specified, and present column is not listed,
                # skip.
                if columns and not i + 1 in columns:
                    continue
                # Execute rules on each cell
                matches = scanner.execute_rules(row[i])
                for match in matches:
                    # Save matches
                    match['url'] = url_object
                    match['scan'] = url_object.scan
                    match.save()

                    warnings_in_row.append((match['matched_rule'], i))

                    # Only replace HIGH sensitivity matches
                    if not match['sensitivity'] == Sensitivity.HIGH:
                        continue

                    if (scanner.scan_object.do_cpr_replace
                            and match['matched_rule'] == 'cpr'):
                        replacement = scanner.scan_object.cpr_replace_text
                    elif (scanner.scan_object.do_name_replace
                          and match['matched_rule'] == 'name'):
                        replacement = scanner.scan_object.name_replace_text
                    elif (scanner.scan_object.do_address_replace
                          and match['matched_rule'] == 'address'):
                        replacement = scanner.scan_object.address_replace_text
                    else:
                        replacement = None

                    # Replace matched text with replacement text dependent
                    # on rule matched if replacement is demanded
                    if replacement is not None:
                        # Some rules like CPR rule mask their matched_data,
                        # so the real matched text is in original_matched_data
                        try:
                            search_text = match['original_matched_data']
                        except KeyError:
                            search_text = match['matched_data']
                        row[i] = row[i].replace(search_text, replacement)

            # Add annotation cell indicating which rules were matched and in
            # which column
            annotation = ", ".join(
                "%s (%s)" % (Match.get_matched_rule_display_name(warning[0]),
                             header_row[warning[1]])
                for warning in warnings_in_row)
            row.append(annotation)
            rows.append(row)

        # print "*** 4 ***"
        # Write to output file
        with open(scanner.scan_object.scan_output_file, 'w') as f:
            writer = unicodecsv.writer(f,
                                       delimiter=';',
                                       quotechar='"',
                                       escapechar='|')
            writer.writerows(rows)
        # print "*** 5 ***"
        return True
Example No. 25
    # return True on success and False on errors
    return pisaStatus.err

import_dir= sys.argv[1] + "/"
user_list = import_dir + "all-students.txt"
students  = 1
if not os.path.exists( user_list ):
    user_list=import_dir + "all-user.txt"
    students = 0
if not os.path.exists( import_dir + "/passwordfiles" ):
  os.mkdir( import_dir + "passwordfiles", 0770 );

all_classes = []
with open(user_list) as csvfile:
    #Detect the type of the csv file
    dialect = unicodecsv.Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    #Create an array of dicts from it
    unicodecsv.register_dialect('oss',dialect)
    reader = unicodecsv.DictReader(csvfile,dialect='oss')
    for row in reader:
        fobj = open("/usr/share/oss/templates/password.html","r")
        template = fobj.read().decode('utf8')
        fobj.close()
        uid=""
        group=""
        for field in reader.fieldnames:
            template = template.replace(field,escape(row[field]))
            if field == "UID" or field == "BENUTZERNAME" or field == "LOGIN":
                uid=row[field]
            if students == 1 and ( field == "CLASS" or field == "KLASSE" ):
Example No. 26
    def analyse_stream(self, byte_file_obj, **kwargs):
        """
        Analyse a stream of bytes and interpret as csv file.

        may want to revert back to this commit if things break:
        https://github.com/derwentx/WooGenerator/commit/c4fabf83d5b4d1e0a4d3ff755cd8eadf1433d253

        Arguments:
        ----
            byte_file_obj (io.IOBase):
                The byte stream to be analysed
            limit (int):
                The number of items to process from the stream
            dialect_suggestion (unicodecsv.Dialect, basestring, optional):
                A suggestion for the dialect to process the csv file as
            encoding (basestring, optional):
                The encoding of the file stream. Defaults to utf8
            stream_name:
                Used to differentiate this stream from others in debugging.

        Raises:
        ----
            UserWarning:
                When analyse_stream is called without clearing transients first
        """

        limit, dialect_suggestion, encoding, stream_name = \
            (kwargs.get('limit'), kwargs.get('dialect_suggestion'),
             kwargs.get('encoding'), kwargs.get('stream_name'))

        if hasattr(self, 'rowcount') and self.rowcount > 1:
            warn = UserWarning(
                'rowcount should be 0. Make sure clear_transients is being called on ancestors'
            )
            self.raise_exception(warn)
        if encoding is None:
            encoding = "utf8"

        if stream_name is None:
            if hasattr(byte_file_obj, 'name'):
                stream_name = byte_file_obj.name
            else:
                stream_name = 'stream'

        if self.DEBUG_PARSER:
            self.register_message(
                "Analysing stream: {0}, encoding: {1}".format(
                    stream_name, encoding))

        # I can't imagine this having any problems
        byte_sample = SanitationUtils.coerce_bytes(byte_file_obj.read(1000))
        byte_file_obj.seek(0)

        if self.DEBUG_PARSER:
            self.register_message("dialect_suggestion: %s" %
                                  dialect_suggestion)

        if dialect_suggestion:
            csvdialect = UnicodeCsvDialectUtils.get_dialect_from_suggestion(
                dialect_suggestion)
        else:
            csvdialect = unicodecsv.Sniffer().sniff(byte_sample)
            assert \
                csvdialect.delimiter == ',' and isinstance(
                    csvdialect.delimiter, str)

        if self.DEBUG_PARSER:
            self.register_message(
                UnicodeCsvDialectUtils.dialect_to_str(csvdialect))

        unicodecsvreader = unicodecsv.reader(byte_file_obj,
                                             dialect=csvdialect,
                                             encoding=encoding,
                                             strict=True)
        return self.analyse_rows(unicodecsvreader,
                                 file_name=stream_name,
                                 limit=limit)
Example No. 27
def build_schema(infile, outfile, delimiter=None, quotechar='\"', encoding=None, dataset_name=None, base="https://iisg.amsterdam/"):
    """
    Build a CSVW schema based on the ``infile`` CSV file, and write the resulting JSON CSVW schema to ``outfile``.

    Takes various optional parameters for instructing the CSV reader, but is also quite good at guessing the right values.
    """

    url = os.path.basename(infile)
    # Get the current date and time (UTC)
    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")

    if dataset_name is None:
        dataset_name = url

    if encoding is None:
        detector = UniversalDetector()
        with open(infile, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        logger.info("Detected encoding: {} ({} confidence)".format(detector.result['encoding'],
                                                                   detector.result['confidence']))

    if delimiter is None:
        try: #Python 3
            with open(infile, 'r', errors='ignore') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter
                csvfile.seek(0)
        except TypeError: #Python 2
            with open(infile, 'r') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter
                csvfile.seek(0)
        logger.info("Detected dialect: {} (delimiter: '{}')".format(dialect, dialect.delimiter))
        delimiter = dialect.delimiter


    logger.info("Delimiter is: {}".format(delimiter))

    if base.endswith('/'):
        base = base[:-1]

    metadata = {
        u"@id": iribaker.to_iri(u"{}/{}".format(base, url)),
        u"@context": [u"https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json",
                     {u"@language": u"en",
                      u"@base": u"{}/".format(base)},
                     get_namespaces(base)],
        u"url": url,
        u"dialect": {u"delimiter": delimiter,
                    u"encoding": encoding,
                    u"quoteChar": quotechar
                    },
        u"dc:title": dataset_name,
        u"dcat:keyword": [],
        u"dc:publisher": {
            u"schema:name": u"CLARIAH Structured Data Hub - Datalegend",
            u"schema:url": {u"@id": u"http://datalegend.net"}
        },
        u"dc:license": {u"@id": u"http://opendefinition.org/licenses/cc-by/"},
        u"dc:modified": {u"@value": today, u"@type": u"xsd:date"},
        u"tableSchema": {
            u"columns": [],
            u"primaryKey": None,
            u"aboutUrl": u"{_row}"
        }
    }

    with io.open(infile, 'rb') as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)

        try:
            # Python 2
            header = r.next()
        except AttributeError:
            # Python 3
            header = next(r)

        logger.info(u"Found headers: {}".format(header))

        if u'' in header:
            logger.warning("WARNING: You have one or more empty column headers in your CSV file. Conversion might produce incorrect results because of conflated URIs or worse")
        if len(set(header)) < len(header):
            logger.warning("WARNING: You have two or more column headers that are syntactically the same. Conversion might produce incorrect results because of conflated URIs or worse")

        # First column is primary key
        metadata[u'tableSchema'][u'primaryKey'] = header[0]

        for head in header:
            col = {
                u"@id": iribaker.to_iri(u"{}/{}/column/{}".format(base, url, head)),
                u"name": head,
                u"titles": [head],
                u"dc:description": head,
                u"datatype": u"string"
            }

            metadata[u'tableSchema'][u'columns'].append(col)

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
Example No. 28
def csv_file_dialect(fullpath):
    """Detect the dialect of a CSV or TXT data file.
    parameters:
        fullpath - full path to the file to process (required)
    returns:
        dialect - a csv.dialect object with the detected attributes
    """
    if fullpath is None or len(fullpath) == 0:
        logging.debug('No file given in csv_file_dialect().')
        return False

    # Cannot function without an actual file where full path points
    if os.path.isfile(fullpath) == False:
        logging.debug('File %s not found in csv_file_dialect().' % fullpath)
        return None

    # Let's look at up to readto bytes from the file
    readto = 4096
    filesize = os.path.getsize(fullpath)

    if filesize < readto:
        readto = filesize

    with open(fullpath, 'rb') as file:
        # Try to read the specified part of the file
        try:
            buf = file.read(readto)
            s = 'csv_file_dialect()'
            s += ' buf:\n%s' % buf
            logging.debug(s)
            # Make a determination based on existence of tabs in the buffer, as the
            # Sniffer is not particularly good at detecting TSV file formats. So, if the
            # buffer has a tab in it, let's treat it as a TSV file 
            if buf.find('\t')>0:
                return tsv_dialect()
#            dialect = csv.Sniffer().sniff(file.read(readto))
            # Otherwise let's see what we can find invoking the Sniffer.
            dialect = csv.Sniffer().sniff(buf)
        except csv.Error:
            # Something went wrong, so let's try to read a few lines from the beginning of 
            # the file
            try:
                file.seek(0)
                s = 'csv_file_dialect()'
                s += ' Re-sniffing with tab to %s' % (readto)
                logging.debug(s)
                sample_text = ''.join(file.readline() for x in xrange(2,4,1))
                dialect = csv.Sniffer().sniff(sample_text)
            # Sorry, couldn't figure it out
            except csv.Error:
                logging.debug('Unable to determine csv dialect')
                return None
    
    # Fill in some standard values for the remaining dialect attributes        
    if dialect.escapechar is None:
        dialect.escapechar='/'

    dialect.skipinitialspace=True
    dialect.strict=False

    return dialect
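csv_file_dialect() returns tsv_dialect() for tab-separated input, a helper that is not shown in this excerpt. A minimal sketch of what such a helper might look like, assuming the stdlib csv module and the same post-processing defaults the function applies (escapechar '/', skipinitialspace, non-strict):

import csv

def tsv_dialect():
    """Return a dialect describing plain tab-separated values (assumed shape)."""
    dialect = csv.excel_tab()
    dialect.escapechar = '/'
    dialect.skipinitialspace = True
    dialect.strict = False
    return dialect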
Example No. 29
    def parse_and_import(
            self,
            file_upload: FileUpload,
            headers: Dict[str, int],
            has_headers: Optional[bool] = None,
            # TODO: it might be better to accept a dialect here to give more configuration options
            delimiter: Optional[str] = None,
            encoding: str = settings.DEFAULT_CHARSET,
            allow_update: bool = True,
            atomic: bool = False,
            create_failed_rows_file: bool = False,
            detailed_errors_limit: int = 20,
            campaign: Optional[Campaign] = None,
            contact_list: Optional[ContactList] = None) -> ImportResult:

        indexes = {index: header for header, index in headers.items()}

        with file_upload.open() as csv_file:
            csv_format_opts = dict(
                dialect=unicodecsv.excel,
                encoding=encoding,
            )

            try:
                if has_headers is None:
                    has_headers = unicodecsv.Sniffer().has_header(
                        csv_file.read(1024).decode(encoding))
                    csv_file.seek(0)
                if delimiter is None:
                    dialect = unicodecsv.Sniffer().sniff(
                        csv_file.read(1024).decode(encoding))
                    csv_format_opts['dialect'] = dialect
                    csv_file.seek(0)
                else:
                    csv_format_opts['delimiter'] = delimiter

                csv_reader = unicodecsv.reader(csv_file, **csv_format_opts)

                header = next(csv_reader) if has_headers else None

                process_rows = partial(self._process_rows, csv_reader, indexes,
                                       allow_update, atomic,
                                       detailed_errors_limit)
            except (UnicodeDecodeError, unicodecsv.Error) as e:
                raise ParsingException(str(e)) from e

            failed_rows_file_upload = None
            with transaction.atomic(savepoint=False):
                if not create_failed_rows_file:
                    created_contacts, updated_contacts, skipped_contacts, errors = process_rows(
                        None)
                else:
                    with tempfile.TemporaryFile() as fp, transaction.atomic(
                            savepoint=False):
                        csv_writer = unicodecsv.writer(fp, **csv_format_opts)

                        if header:
                            csv_writer.writerow(header)

                        created_contacts, updated_contacts, skipped_contacts, errors = process_rows(
                            csv_writer.writerow)

                        if errors:
                            fp.seek(0)
                            failed_rows_file_upload = FileUpload.objects.create(
                                owner=file_upload.owner,
                                uploader=FileUploader.SYSTEM,
                                ttl=datetime.timedelta(days=2),
                                file=File(
                                    fp,
                                    "failed-rows-from-%s" % file_upload.name))

                if campaign:
                    participating = set(
                        campaign.contacts.values_list('id', flat=True))
                    Participation.objects.bulk_create((Participation(
                        contact_id=contact_id,
                        campaign=campaign,
                    ) for contact_id in chain(
                        created_contacts,
                        filter(
                            lambda contact_id: contact_id not in participating,
                            updated_contacts))))

                if contact_list:
                    contact_list.contacts.add(*created_contacts)
                    contact_list.contacts.add(*updated_contacts)

            return ImportResult(len(created_contacts), len(updated_contacts),
                                len(skipped_contacts), errors,
                                failed_rows_file_upload)
Example No. 30
def parse_csv(myfile, newsletter, ignore_errors=False):
    """
    Parse addresses from CSV file-object into newsletter.

    Returns a dictionary mapping email addresses into Subscription objects.
    """

    import unicodecsv

    encoding = get_encoding(myfile)

    # Attempt to detect the dialect
    # Ref: https://bugs.python.org/issue5332
    encodedfile = io.TextIOWrapper(myfile, encoding=encoding, newline='')
    dialect = unicodecsv.Sniffer().sniff(encodedfile.read(1024))

    # Reset the file index
    myfile.seek(0)

    logger.info('Detected encoding %s and dialect %s for CSV file',
                encoding, dialect)

    myreader = unicodecsv.reader(myfile, dialect=dialect, encoding=encoding)

    firstrow = next(myreader)

    # Find name column
    colnum = 0
    namecol = None
    for column in firstrow:
        if "name" in column.lower() or _("name") in column.lower():
            namecol = colnum

            if "display" in column.lower() or \
                    _("display") in column.lower():
                break

        colnum += 1

    if namecol is None:
        raise forms.ValidationError(_(
            "Name column not found. The name of this column should be "
            "either 'name' or '%s'.") % _("name")
        )

    logger.debug("Name column found: '%s'", firstrow[namecol])

    # Find email column
    colnum = 0
    mailcol = None
    for column in firstrow:
        if 'email' in column.lower() or \
                'e-mail' in column.lower() or \
                _("e-mail") in column.lower():

            mailcol = colnum

            break

        colnum += 1

    if mailcol is None:
        raise forms.ValidationError(_(
            "E-mail column not found. The name of this column should be "
            "either 'email', 'e-mail' or '%(email)s'.") %
            {'email': _("e-mail")}
        )

    logger.debug("E-mail column found: '%s'", firstrow[mailcol])

    if namecol == mailcol:
        raise forms.ValidationError(
            _(
                "Could not properly determine the proper columns in the "
                "CSV-file. There should be a field called 'name' or "
                "'%(name)s' and one called 'e-mail' or '%(email)s'."
            ) % {
                "name": _("name"),
                "email": _("e-mail")
            }
        )

    logger.debug('Extracting data.')

    address_list = AddressList(newsletter, ignore_errors)

    for row in myreader:
        if not max(namecol, mailcol) < len(row):
            logger.warning(
                "Column count does not match for row number %d",
                myreader.line_num, extra=dict(data={'row': row})
            )

            if ignore_errors:
                # Skip this record
                continue
            else:
                raise forms.ValidationError(_(
                    "Row with content '%(row)s' does not contain a name and "
                    "email field.") % {'row': row}
                )

        address_list.add(
            row[mailcol], row[namecol], location="line %d" % myreader.line_num
        )

    return address_list.addresses
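parse_csv() wraps the binary upload in io.TextIOWrapper only so the Sniffer sees decoded text (per the referenced CPython issue), then rewinds the raw file and lets unicodecsv decode it again. A stdlib-only sketch of the same sniff-then-read flow on a seekable binary file object, with the project's get_encoding() replaced by a fixed encoding argument:

import csv
import io

def sniff_binary_csv(binary_file, encoding="utf-8", preview=5):
    """Sniff the dialect of a seekable binary file-like object and return preview rows."""
    text = io.TextIOWrapper(binary_file, encoding=encoding, newline="")
    try:
        dialect = csv.Sniffer().sniff(text.read(1024))
        text.seek(0)
        reader = csv.reader(text, dialect=dialect)
        rows = [row for _, row in zip(range(preview), reader)]
    finally:
        text.detach()  # leave the underlying binary stream open for the caller
    return dialect, rows

# e.g. with open("subscribers.csv", "rb") as f:   (hypothetical upload)
#          dialect, rows = sniff_binary_csv(f)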