Example #1
    def test_delimiters(self):
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(self.sample3)
        # given that all three lines in sample3 are equal,
        # I think that any character could have been 'guessed' as the
        # delimiter, depending on dictionary order
        self.assertIn(dialect.delimiter, self.sample3)
        dialect = sniffer.sniff(self.sample3, delimiters="?,")
        self.assertEqual(dialect.delimiter, "?")
        dialect = sniffer.sniff(self.sample3, delimiters="/,")
        self.assertEqual(dialect.delimiter, "/")
        dialect = sniffer.sniff(self.sample4)
        self.assertEqual(dialect.delimiter, ";")
        dialect = sniffer.sniff(self.sample5)
        self.assertEqual(dialect.delimiter, "\t")
        dialect = sniffer.sniff(self.sample6)
        self.assertEqual(dialect.delimiter, "|")
        dialect = sniffer.sniff(self.sample7)
        self.assertEqual(dialect.delimiter, "|")
        self.assertEqual(dialect.quotechar, "'")
        dialect = sniffer.sniff(self.sample8)
        self.assertEqual(dialect.delimiter, '+')
        dialect = sniffer.sniff(self.sample9)
        self.assertEqual(dialect.delimiter, '+')
        self.assertEqual(dialect.quotechar, "'")
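The comment above notes that when every line in the sample is identical, almost any recurring character can win the guess. A minimal, self-contained sketch (the sample string is made up) showing how the optional delimiters argument pins the result down:

import csv

# Every line is identical, so many characters recur with the same frequency
# and an unrestricted sniff may settle on an arbitrary one of them.
sample = "abc?def?ghi\nabc?def?ghi\nabc?def?ghi\n"

sniffer = csv.Sniffer()
# Restricting the candidate delimiters makes the outcome deterministic.
dialect = sniffer.sniff(sample, delimiters="?,")
print(dialect.delimiter)  # '?'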
Example #2
def load_data(year):
    '''
    Load data into memory cache
    '''
    year = str(year)
    if year in CACHE:
        return True

    data_file = os.path.join(os.path.dirname(__file__), 'data',
                             '{}.csv'.format(year))
    if not os.path.isfile(data_file):
        return False

    CACHE[year] = {}
    with io.open(data_file, encoding='utf-8') as rf:
        # Detect CSV header line
        has_header = csv.Sniffer().has_header(rf.read(1024))
        rf.seek(0)

        reader = csv.DictReader(rf, DATA_FIELDS)
        if has_header:
            next(reader)

        for data_line in reader:
            day = clean_up_dict(data_line)
            # Pre-parse the date into int parts and the flags into bools so they don't need parsing later
            dt = datetime.strptime(day['date'], '%Y-%m-%d')
            day['year'] = dt.year
            day['month'] = dt.month
            day['day'] = dt.day
            day['isholiday'] = bool(int(day['isholiday']))
            day['isworkday'] = bool(int(day['isworkday']))
            CACHE[year][day.pop('date')] = day

    return True
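load_data above relies on the default comma dialect and only sniffs for a header. A minimal sketch (the helper name and the 1024-byte sample size are assumptions, not part of the original) that also detects the dialect before building the DictReader:

import csv
import io


def read_rows(path, fieldnames):
    # Hypothetical helper: sample the file, detect dialect and header,
    # then return the parsed rows.
    with io.open(path, encoding='utf-8') as rf:
        sample = rf.read(1024)
        rf.seek(0)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        reader = csv.DictReader(rf, fieldnames, dialect=dialect)
        if sniffer.has_header(sample):
            next(reader)  # skip the header row
        return list(reader)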
Example #3
    def parse(self, csvsrc):
        text, encoding = self.detect_encoding(csvsrc, default_encodings=['utf-8', 'utf-16'])
        #FIXME: raise parse error if encoding detection fails?
        self.encoding = encoding or 'utf-8'

        sniffer = csv.Sniffer()
        sample = text[:1024]

        try:
            self.dialect = sniffer.sniff(sample)
            if self.dialect.quoting == csv.QUOTE_MINIMAL:
                #HACKISH: most probably a default, not real detection
                self.dialect.quoting = csv.QUOTE_ALL
                self.dialect.doublequote = True
        except csv.Error:
            self.dialect = 'default'

        inputfile = io.StringIO(text)
        try:
            fieldnames = detect_header(inputfile, self.dialect, self.fieldnames)
            self.fieldnames = fieldnames
        except csv.Error:
            pass

        inputfile.seek(0)
        reader = try_dialects(inputfile, self.fieldnames, self.dialect)

        first_row = True
        for row in reader:
            newce = self.UnitClass()
            newce.fromdict(row)
            if not first_row or not newce.match_header():
                self.addunit(newce)
            first_row = False
Example #4
    def ingest(self, file_path):
        with io.open(file_path, 'rb') as fh:
            encoding = self.detect_stream_encoding(fh)
            log.debug("Detected encoding [%s]: %s", self.result, encoding)

        with io.open(file_path, 'r', newline='', encoding=encoding) as fh:
            sample = fh.read(4096 * 10)
            fh.seek(0)

            dialect = csv.Sniffer().sniff(sample)
            # dialect.delimiter = dialect.delimiter[0]
            has_header = csv.Sniffer().has_header(sample)

            reader = csv.reader(fh, dialect=dialect)
            rows = self.generate_rows(reader, has_header=has_header)
            self.result.flag(self.result.FLAG_TABULAR)
            self.result.emit_rows(rows)
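ingest above calls sniff() unguarded; csv.Sniffer.sniff raises csv.Error when it cannot settle on a delimiter. A small fallback sketch (the helper name is hypothetical) that degrades to the stock Excel dialect instead of failing:

import csv


def sniff_or_excel(sample):
    # Fall back to the built-in Excel dialect when sniffing fails.
    try:
        return csv.Sniffer().sniff(sample)
    except csv.Error:
        return csv.excel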
Example #5
    def test_sniff(self):
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(self.sample1)
        self.assertEqual(dialect.delimiter, ",")
        self.assertEqual(dialect.quotechar, '"')
        self.assertEqual(dialect.skipinitialspace, True)

        dialect = sniffer.sniff(self.sample2)
        self.assertEqual(dialect.delimiter, ":")
        self.assertEqual(dialect.quotechar, "'")
        self.assertEqual(dialect.skipinitialspace, False)
Example #6
    def test_doublequote(self):
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(self.header1)
        self.assertFalse(dialect.doublequote)
        dialect = sniffer.sniff(self.header2)
        self.assertFalse(dialect.doublequote)
        dialect = sniffer.sniff(self.sample2)
        self.assertTrue(dialect.doublequote)
        dialect = sniffer.sniff(self.sample8)
        self.assertFalse(dialect.doublequote)
        dialect = sniffer.sniff(self.sample9)
        self.assertTrue(dialect.doublequote)
Example #7
def read_csv(inputFilePath, outFilePath, headerFields, fileEncoding, detect):
    # The output file is always written as UTF-8; fileEncoding only changes
    # how the input file is decoded.
    encoding = "utf-8"
    outfile = io.open(outFilePath, 'w', encoding=encoding)

    if fileEncoding is not None:
        encoding = fileEncoding
    filetype = ""
    numlines = 0
    with io.open(inputFilePath, "r", encoding=encoding) as file:
        sniffed = csv.Sniffer().sniff(file.readline())
        while sniffed.delimiter not in ";,\t":
            sniffed = csv.Sniffer().sniff(file.readline())
        file.seek(0)
        outData = ""
        reader = csv.reader(file, delimiter=sniffed.delimiter)
        if detect == "true":
            for i in reader:
                row = ';'.join(i) + '\n'
                if filetype == "":
                    filetype, header = getFileType(row, headerFields, encoding)
                    if header:
                        outData = header
                else:
                    break
        else:
            for i in reader:
                row = ';'.join(i) + '\n'
                if filetype == "":
                    filetype, header = getFileType(row, headerFields, encoding)
                    if header:
                        outData = header
                else:
                    outData += row  # accumulate every data row, not just the last one
                    numlines = numlines + 1
            outfile.write(outData)
            outfile.close()
    return filetype, numlines
Example #8
    def __init__(self, file_path, encoding=None, delimiter=None):
        if encoding is None:
            # Read raw bytes so guess_encoding() sees the undecoded data.
            with io.open(file_path, 'rb') as fh:
                data = fh.read(SAMPLE_SIZE)
                encoding = guess_encoding(data)

        self.fh = io.open(file_path, 'r', encoding=encoding)
        data = self.fh.read(SAMPLE_SIZE)
        dialect = csv.Sniffer().sniff(data)
        if delimiter is not None:
            dialect.delimiter = delimiter
        self.fh.seek(0)

        self.reader = iter(csv.reader(self.fh, dialect=dialect))
        self.headers = next(self.reader)
        self.count = 0
Example #9
def get_campaign_events_from_csv(file_name):
    BASE_COLUMNS = [
        'relative_to', 'offset', 'unit', 'delivery_hour', 'lang_code',
        'message'
    ]
    with io.open(file_name, encoding='utf-8') as csv_file:
        has_header = csv.Sniffer().has_header(csv_file.read(2048))
        csv_file.seek(0)
        csv_reader = csv.reader(csv_file)
        base_col_len = len(BASE_COLUMNS)
        # Without a header row there is no way to declare translation columns.
        has_translations = False
        if has_header:
            header = next(csv_reader)
            length = len(header)
            if length < base_col_len:
                raise ValueError(
                    "Invalid CSV format: A minimum of {0} columns expected".
                    format(base_col_len))
            # Translations are given as optional (lang_code, message) column pairs at the end of each row.
            if length > base_col_len:
                # Translations need to be provided as lang_code, message pairs
                if (length - base_col_len) % 2 != 0:
                    raise ValueError(
                        "Invalid CSV format: Pairs of lang_code, message columns expected"
                    )
                has_translations = True
            else:
                has_translations = False
        for row in csv_reader:
            event = CampaignEvent(relative_to=row[0].strip(),
                                  offset=row[1].strip(),
                                  unit=row[2].strip(),
                                  delivery_hour=row[3].strip(),
                                  message=row[5].strip())
            lang_code = row[4].strip()
            if has_translations:
                translations = row[base_col_len:]
                full_message = {lang_code: event.message}
                iterator = iter(translations)
                translations = zip(iterator, iterator)
                for code, msg in translations:
                    full_message[code.strip()] = msg.strip()
                event.message = json.dumps(full_message, sort_keys=True)
            yield event
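The zip(iterator, iterator) idiom used above to pair the trailing lang_code/message columns, shown in isolation (the values are made up):

flat = ['de', 'Hallo', 'fr', 'Bonjour']
it = iter(flat)
print(list(zip(it, it)))  # [('de', 'Hallo'), ('fr', 'Bonjour')]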
Example #10
def open_csv(file_path, encoding=None, delimiter=None):
    if encoding is None:
        with io.open(file_path, 'rb') as fh:
            data = fh.read(SAMPLE_SIZE)
            encoding = guess_encoding(data)

    fh = io.open(file_path, 'r', encoding=encoding)
    if delimiter is None:
        data = fh.read(SAMPLE_SIZE)
        dialect = csv.Sniffer().sniff(data)
        delimiter = dialect.delimiter
        fh.seek(0)

    reader = csv.reader(fh, delimiter=delimiter)
    headers = []
    for row in reader:
        headers = row
        break
    fh.seek(0)
    return fh, delimiter, headers
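A hedged usage sketch for open_csv above ('contacts.csv' is a made-up path). Because the function seeks back to the start of the file, the header row is still in the stream and has to be skipped again:

import csv

fh, delimiter, headers = open_csv('contacts.csv')
try:
    reader = csv.reader(fh, delimiter=delimiter)
    next(reader)  # open_csv treats the first row as headers, so skip it here
    for row in reader:
        print(dict(zip(headers, row)))
finally:
    fh.close()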
Example #11
def read_excel_xlrd(inputFilePath, outFilePath, headerFields, detect):
    wb = xlrd.open_workbook(
        inputFilePath)  #on_demand = True, encoding='cp1252'
    outfile = open(outFilePath, "w")
    filetype = ""
    numlines = 0
    for sheet_name in wb.sheet_names():
        sh = wb.sheet_by_name(sheet_name)
        if sh.ncols == 1:
            sniffed = csv.Sniffer().sniff(''.join(sh.row_values(0)))
            for i in range(sh.nrows):
                row = ''.join(
                    map(lambda e: unicode(e).strip(), ';'.join(
                        sh.row_values(i)).split(sniffed.delimiter))) + '\n'

                if filetype == "":
                    filetype, header = getFileType(row, headerFields)
                    if header:
                        outfile.write(header)
                else:
                    if detect == "true":
                        break
                    outfile.write(row.encode("utf8"))
                    numlines = numlines + 1
        else:
            for i in range(sh.nrows):
                row = ';'.join(
                    map(lambda e: unicode(e).strip(), sh.row_values(i))) + '\n'

                if filetype == "":
                    filetype, header = getFileType(row, headerFields)
                    if header:
                        outfile.write(header)
                else:
                    if detect == "true":
                        break
                    outfile.write(row.encode("utf8"))
                    numlines = numlines + 1
    outfile.close()
    return filetype, numlines
Example #12
    def test_has_header_regex_special_delimiter(self):
        sniffer = csv.Sniffer()
        self.assertEqual(sniffer.has_header(self.sample8), False)
        self.assertEqual(sniffer.has_header(self.header2 + self.sample8), True)
Example #13
    def test_has_header(self):
        sniffer = csv.Sniffer()
        self.assertEqual(sniffer.has_header(self.sample1), False)
        self.assertEqual(sniffer.has_header(self.header1 + self.sample1), True)
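A standalone illustration of the has_header() heuristic exercised by the tests above (sample strings made up): it compares the first row against the column types and lengths seen in the remaining rows.

import csv

sniffer = csv.Sniffer()
body = "alice,34,10000\nbobby,29,12000\ncarla,51,90000\n"
header = "name,age,salary\n"
print(sniffer.has_header(body))           # False: the first row looks like data
print(sniffer.has_header(header + body))  # True: 'age' and 'salary' are not numeric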