Beispiel #1
0
 def guess_dialect(self, sample):
     """Guess the CSV dialect of ``sample`` and whether it has a header.

     ``csv.Sniffer`` is tried first.  When it raises ``csv.Error`` (for
     instance because a row contains an empty column such as
     ``89.187.1.81,06-05-2016,,CZ,botnet drone``), the delimiter is guessed
     by hand from the second line of the sample, or from the first line
     when the sample has only one.

     :param sample: text taken from the beginning of the CSV input
     :return: tuple ``(dialect, has_header)``; ``dialect`` is a dialect
         class, ``has_header`` is always ``False`` on the fallback path
     """
     sniffer = Sniffer()
     try:
         dialect = sniffer.sniff(sample)
         has_header = sniffer.has_header(sample)
     except Error:
         has_header = False  # cannot be detected reliably here, just guess
         lines = sample.split("\n")
         # Prefer the second line: the header has no empty column for sure.
         # A single-line sample has no second line, so use the first one.
         probe = lines[1] if len(lines) > 1 else lines[0]
         candidates = (",", ";", "|")
         # A doubled sign (i.e. an empty column) is the strongest hint ...
         delimiter = next((c for c in candidates if c + c in probe), "")
         if not delimiter:
             # ... otherwise take anything that resembles a delimiter.
             delimiter = next((c for c in candidates if c in probe), "")

         class GuessedDialect(csv.unix_dialect):
             """Private copy, so the shared csv.unix_dialect stays untouched."""

         dialect = GuessedDialect
         if delimiter:
             # An empty delimiter is invalid for the csv module, so the
             # unix_dialect default is kept when nothing was found.
             dialect.delimiter = delimiter
     if not dialect.escapechar:
         dialect.escapechar = '\\'
     dialect.doublequote = True
     return dialect, has_header
Beispiel #2
0
 def guess_dialect(self, sample):
     """Guess the CSV dialect of ``sample`` and whether it has a header.

     ``csv.Sniffer`` is tried first.  When it raises ``csv.Error`` (for
     instance because a row contains an empty column such as
     ``89.187.1.81,06-05-2016,,CZ,botnet drone``), the delimiter is guessed
     by hand from the second line of the sample, or from the first line
     when the sample has only one.  A blank sample prints a message and
     terminates the program.

     :param sample: text taken from the beginning of the CSV input
     :return: tuple ``(dialect, has_header)``; ``dialect`` is a dialect
         class, ``has_header`` is always ``False`` on the fallback path
     """
     sniffer = Sniffer()
     try:
         dialect = sniffer.sniff(sample)
         has_header = sniffer.has_header(sample)
     except Error:
         if sample.strip() == "":
             print("The file seems empty")
             quit()
         has_header = False  # cannot be detected reliably here, just guess
         lines = sample.split("\n")
         # Prefer the second line: the header has no empty column for sure.
         # A single-line file has no second line, so use the first one.
         probe = lines[1] if len(lines) > 1 else lines[0]
         candidates = (",", ";", "|")
         # A doubled sign (i.e. an empty column) is the strongest hint ...
         delimiter = next((c for c in candidates if c + c in probe), "")
         if not delimiter:
             # ... otherwise take anything that resembles a delimiter.
             delimiter = next((c for c in candidates if c in probe), "")

         class GuessedDialect(csv.unix_dialect):
             """Private copy, so the shared csv.unix_dialect stays untouched."""

         dialect = GuessedDialect
         if delimiter:
             # An empty delimiter is invalid for the csv module, so the
             # unix_dialect default is kept when nothing was found.
             dialect.delimiter = delimiter
     if not dialect.escapechar:
         dialect.escapechar = '\\'
     dialect.doublequote = True
     return dialect, has_header
Beispiel #3
0
 def __init__(self, inFile):
     """Open ``inFile`` as CSV and prepare a reader over its rows.

     The dialect is sniffed from the first 1024 characters.  When the
     sample looks like it has a header, the first row is consumed and
     stored in ``self.varNames``; otherwise ``self.varNames`` is ``None``.
     The file is left open on success because ``self.reader`` reads from
     it lazily.

     :param inFile: path of the CSV file to open
     :raises csv.Error: when the dialect cannot be determined
     """
     from csv import Sniffer, reader
     # newline='' lets the csv module interpret line endings itself.
     csvFile = open(inFile, 'r', newline='')
     try:
         sample = csvFile.read(1024)
         csvFile.seek(0)
         # sniff() and has_header() are instance methods, so a Sniffer
         # object is required; calling them on the class raises TypeError.
         sniffer = Sniffer()
         self.reader = reader(csvFile, sniffer.sniff(sample))
         if sniffer.has_header(sample):
             self.varNames = next(self.reader)
         else:
             self.varNames = None
     except Exception:
         # Do not leak the handle when sniffing or reading fails.
         csvFile.close()
         raise
 def __init__(self, inFile):
     """Open ``inFile`` as CSV and prepare a reader over its rows.

     The dialect is sniffed from the first 1024 characters.  When the
     sample looks like it has a header, the first row is consumed and
     stored in ``self.varNames``; otherwise ``self.varNames`` is ``None``.
     The file is left open on success because ``self.reader`` reads from
     it lazily.

     :param inFile: path of the CSV file to open
     :raises csv.Error: when the dialect cannot be determined
     """
     from csv import Sniffer, reader
     # newline='' lets the csv module interpret line endings itself.
     csvFile = open(inFile, 'r', newline='')
     try:
         sample = csvFile.read(1024)
         csvFile.seek(0)
         # sniff() and has_header() are instance methods, so a Sniffer
         # object is required; calling them on the class raises TypeError.
         sniffer = Sniffer()
         self.reader = reader(csvFile, sniffer.sniff(sample))
         if sniffer.has_header(sample):
             self.varNames = next(self.reader)
         else:
             self.varNames = None
     except Exception:
         # Do not leak the handle when sniffing or reading fails.
         csvFile.close()
         raise
Beispiel #5
0
def getDelimiter(path):
    """Guess the delimiter of the CSV file at ``path`` and whether it has a header.

    Only the first three lines of the file are inspected.  The delimiter is
    searched among ``;``, ``,`` and space; when the sniffer cannot decide,
    ``,`` is assumed.  When header detection itself fails, the file is
    reported as having no header instead of raising.

    :param path: path of the CSV file to inspect
    :return: tuple ``(delimiter, has_header)``
    """
    from csv import Error as CsvError

    sniffer = Sniffer()
    with open(path, 'r') as rfile:
        sample = "".join(rfile.readline() for _ in range(3))

    try:
        delimiter = sniffer.sniff(sample, delimiters=";, ").delimiter
    except CsvError:
        delimiter = ","  # sensible default when nothing can be determined

    try:
        # has_header() sniffs the sample again internally and can therefore
        # fail on exactly the samples that needed the default above.
        has_header = sniffer.has_header(sample)
    except CsvError:
        has_header = False
    return delimiter, has_header
Beispiel #6
0
def p_csv(dialect: Optional[str], padding: bool) -> int:
    """Re-emit CSV data read from standard input to standard output.

    Input for which the sniffer detects no header is echoed unchanged.
    Otherwise the data is passed through ``_read`` and written back with
    the given ``dialect``, or with the sniffed one when ``dialect`` is
    falsy.

    :param dialect: name of the dialect to use, or ``None`` to sniff it
    :param padding: forwarded to ``_read`` (its meaning is defined there)
    :return: ``0`` on success, ``1`` when a ``CSVErr`` was logged
    """
    data = stdin.read()
    sniffer = Sniffer()

    try:
        # has_header() sniffs the data itself and may raise as well, so it
        # has to sit inside the handler (assuming CSVErr is csv.Error --
        # TODO confirm against the file's imports).
        if not sniffer.has_header(data):
            print(data, end="")
            return 0
        d = dialect or sniffer.sniff(data)
        rows = _read(data, dialect=d, padding=padding)
        writer(stdout, dialect=d).writerows(rows)
    except CSVErr as e:
        log.critical("%s", f"{ERROR}{linesep}{e}")
        return 1
    return 0
Beispiel #7
0
    def read(filename):
        """Read a ``;``- or ``,``-delimited CSV file into a list of row dicts.

        The dialect is sniffed from the first 1024 characters of the file;
        the first row supplies the dictionary keys.

        :param filename: path of the CSV file
        :return: list with one dict per data row
        :raises csv.Error: when the dialect cannot be determined
        """
        # newline="" is what the csv module expects; without it line breaks
        # embedded in quoted fields are not preserved correctly.
        with open(filename, "r", newline="") as csvfile:
            sample = csvfile.read(1024)
            dialect = Sniffer().sniff(sample, delimiters=";,")
            csvfile.seek(0)
            return list(DictReader(csvfile, dialect=dialect))
Beispiel #8
0
    def read(filename):
        """Read a ``;``- or ``,``-delimited CSV file into row dicts plus field names.

        The dialect is sniffed from the first 4096 characters of the file;
        the first row supplies the dictionary keys.

        :param filename: path of the CSV file
        :return: tuple ``(rows, fieldnames)`` where ``rows`` is a list with
            one dict per data row and ``fieldnames`` are the reader's keys
        :raises csv.Error: when the dialect cannot be determined
        """
        # newline="" is what the csv module expects; without it line breaks
        # embedded in quoted fields are not preserved correctly.
        with open(filename, "r", newline="") as csvfile:
            sample = csvfile.read(4096)
            dialect = Sniffer().sniff(sample, delimiters=";,")
            csvfile.seek(0)
            lines_reader = DictReader(csvfile, dialect=dialect)
            lines = list(lines_reader)
            return lines, lines_reader.fieldnames
Beispiel #9
0
    def guess_dialect(sample):
        """Guess the CSV dialect of ``sample`` and whether it has a header.

        ``csv.Sniffer`` is tried first.  When it fails, or proposes a letter
        as delimiter, the header is detected by comparing how similar the
        first line is to the remaining ones and the delimiter is guessed by
        hand.  A blank sample prints a message and terminates the program.

        :param sample: sequence of the first lines of the input
        :return: tuple ``(dialect, has_header, seems_single)``; ``dialect``
            is a dialect class and ``seems_single`` is ``True`` when a
            one-line input looks like a single column, in which case ``|``
            is set as the delimiter
        """
        sniffer = Sniffer()
        sample_text = "".join(sample)
        try:
            dialect = sniffer.sniff(sample_text)
            has_header = sniffer.has_header(sample_text)
            if re.match("[a-z]", dialect.delimiter.lower()):
                # we do not allow letters to be delimiters, seems like non-sense
                raise Error
        except Error:
            # delimiter failed - maybe there is an empty column:
            # "89.187.1.81,06-05-2016,,CZ,botnet drone"
            if sample_text.strip() == "":
                print("The file seems empty")
                quit()

            # header detection
            rows = [line.strip() for line in sample]
            body = rows[1:]
            if body:
                header_to_rows_similarity = mean(
                    [SequenceMatcher(None, rows[0], it).ratio() for it in body])
                if len(body) > 1:
                    rows_similarity = mean([
                        SequenceMatcher(None, *comb).ratio()
                        for comb in itertools.combinations(body, 2)
                    ])
                    # it seems that the first line differs -> header
                    has_header = rows_similarity > header_to_rows_similarity + 0.1
                else:
                    has_header = header_to_rows_similarity < 0.5
            else:
                has_header = False

            # We do not take the header (it has no empty column for sure),
            # unless there is a single line in the file.
            probe = sample[1] if len(sample) > 1 else sample[0]
            candidates = (",", ";", "|")
            # let's suppose the doubled sign is the delimiter ...
            delimiter = next((c for c in candidates if c + c in probe), "")
            if not delimiter:
                # ... else try to find anything that resembles a delimiter
                delimiter = next((c for c in candidates if c in probe), "")

            class GuessedDialect(csv.unix_dialect):
                """Private copy, so the shared csv.unix_dialect stays untouched."""

            dialect = GuessedDialect
            if delimiter:
                dialect.delimiter = delimiter
        if not dialect.escapechar:
            dialect.escapechar = '\\'
        dialect.doublequote = True

        seems_single = False
        if len(sample) == 1:
            # there is a single line in the sample = in the input, so this is
            # definitely not a header
            has_header = False
            if dialect.delimiter not in [".", ",", "\t"] and "|" not in sample_text:
                # use case: a short one-line input like "convey hello" would
                # produce a stupid "l" delimiter
                # XX should be None maybe, let's think a whole row is a single
                # column - but then we could not add columns
                dialect.delimiter = "|"
                seems_single = True
        if dialect.delimiter == "." and "," not in sample_text:
            # let's propose the common use case (bare list of IP addresses)
            # over a strange use case with "." delimiting
            dialect.delimiter = ","
        return dialect, has_header, seems_single