Example #1
def upload_file():
    if request.method == 'POST':
        # check if the post request has the file part
        if 'csv' in request.files:
            # get number of neighbours, default=10
            neighbours = request.args.get('neighbours', '10')
            if not isInt(neighbours):
                flash(
                    'Invalid number of neighbours. Use "neighbours={count}". Default is 10.'
                )
                flask.abort(422, '\n'.join(flask.get_flashed_messages()))
            column = request.args.get('column', '0')
            if not isInt(column):
                flash(
                    'Invalid column index specified. Use "column={index}" parameter. Default is 0.'
                )
                flask.abort(422, '\n'.join(flask.get_flashed_messages()))
            file = request.files['csv']
            # if the user does not select a file, the browser may also
            # submit an empty part without a filename
            if file.filename == '':
                flash('No selected file')
                flask.abort(400, '\n'.join(flask.get_flashed_messages()))
            if file:
                #filename = secure_filename(file.filename)
                reader = anycsv.reader(content=file.read())
                values = [r[int(column)] for r in reader]
                return get_response(values=values, neighbours=int(neighbours))

        flash('Use "csv" parameter to specify file')
    return flask.abort(400, '\n'.join(flask.get_flashed_messages()))
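A hedged client-side sketch for the endpoint above; the host and route are hypothetical, while the "csv" file part and the "neighbours"/"column" query parameters come from the handler itself:

import requests

# hypothetical URL; only the parameter names are fixed by the handler above
with open('data.csv', 'rb') as fh:
    resp = requests.post('http://localhost:5000/upload',
                         params={'neighbours': 5, 'column': 2},
                         files={'csv': fh})
print(resp.status_code)
print(resp.text)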
Example #2
def parse_csv(csv, out, bench, bench_out):
    """Inspect a CSV file to figure out about the dialect, comment and header lines and the overall structure."""

    click.echo("Input CSV {}".format(csv))

    table = anycsv.reader(csv)

    if out:
        if out.endswith(".gz"):
            fout = gzip.open(out, 'wt', newline='')
        else:
            fout = open(out, 'wt', newline='')
    else:
        fout = sys.stdout

    # import under an alias to avoid shadowing the csv argument
    import csv as csvlib
    writer = csvlib.writer(fout)

    for row in table:
        writer.writerow(row)

    if out:
        fout.close()

    if bench:
        click.echo("TIMING")
        click.echo(Timer.printStats())
    if bench_out:
        Timer.to_csv(bench_out)
Example #3
    def __init__(self,
                 filename=None,
                 url=None,
                 content=None,
                 sample_size=20,
                 skip_guess_encoding=False,
                 structure_detector=SimpleStructureDetector(),
                 max_file_size=-1):

        self.table = anycsv.reader(filename=filename,
                                   url=url,
                                   content=content,
                                   skip_guess_encoding=skip_guess_encoding,
                                   sniff_lines=sample_size,
                                   max_file_size=max_file_size)

        keys = [
            'encoding', 'url', 'filename', 'delimiter', 'quotechar',
            'lineterminator', 'skipinitialspace', 'quoting', 'doublequote'
        ]
        self.meta = {}
        for k, v in self.table.__dict__.items():
            if k in keys and v:
                self.meta[k] = v
        for k, v in self.table.dialect.items():
            if k in keys:
                self.meta[k] = v
        if 'url' in self.meta:
            self.meta['uri'] = self.meta.pop('url')

        #self.meta['dialect'] = self.table.dialect
        log.debug("Input file dialect",
                  dialect=self.table.dialect,
                  encoding=self.meta['encoding'])
        self.sample = []
        for i, row in enumerate(self.table):
            if i >= sample_size:
                break
            self.sample.append(row)

        # include empty lines for now
        self.descriptionLines = structure_detector.guess_description_lines(
            self.sample)

        if self.descriptionLines is None:
            raise ValueError(
                "structure_detector must return a value; if no description lines exist, return an empty list"
            )
        # allow multiple header lines if present; if none exist, return an empty list
        self.header_lines = structure_detector.guess_headers(self.sample)
        if self.header_lines is None:
            raise ValueError(
                "structure_detector must return a value; if no header lines exist, return an empty list"
            )
        self.columns = structure_detector.guess_columns(self.sample)

        self.table.seek_line(0)  # alternatively: len(self.descriptionLines) + len(self.header_lines)
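From the None checks above, any structure_detector must implement guess_description_lines, guess_headers, and guess_columns, each returning a non-None value. A minimal sketch of such a detector; the heuristics are assumptions, only the method names and return conventions come from the code above:

class NaiveStructureDetector:
    """Hypothetical detector satisfying the interface used by __init__ above."""

    def guess_description_lines(self, sample):
        # assumption: leading rows with at most one cell are description lines
        lines = []
        for row in sample:
            if len(row) <= 1:
                lines.append(row)
            else:
                break
        return lines  # never None; an empty list means no description lines

    def guess_headers(self, sample):
        # assumption: the first multi-column row is the single header line
        for row in sample:
            if len(row) > 1:
                return [row]
        return []  # never None; an empty list means no header lines

    def guess_columns(self, sample):
        # assumption: the column count of the widest sampled row
        return max((len(row) for row in sample), default=0)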
Example #4
def test_single_file():
    csv = "/Users/jumbrich/data/mimesis_csvs/encoding/latin.csv"
    reader = anycsv.reader(csv)

    for row in reader:
        assert len(row) == 9

    assert reader.digest is not None
Example #5
def test_file(tmpdir):
    p = tmpdir.mkdir("tmp.csvs").mkdir("data")
    csv = _create_table(p, rows=200, columns=4, gzipped=True)

    reader = anycsv.reader(csv)

    for row in reader:
        assert len(row) == 4

    assert reader.digest is not None
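The _create_table helper is not shown in this example; a plausible sketch under stated assumptions (the name, signature and gzip behaviour come from the test, the cell contents are made up):

import csv
import gzip
import os

def _create_table(p, rows=200, columns=4, gzipped=True):
    # hypothetical reconstruction: write a rows x columns CSV
    # under directory p and return its path
    name = 'table.csv.gz' if gzipped else 'table.csv'
    path = os.path.join(str(p), name)
    opener = gzip.open if gzipped else open
    with opener(path, 'wt', newline='') as f:
        writer = csv.writer(f)
        for r in range(rows):
            writer.writerow(['r{}c{}'.format(r, c) for c in range(columns)])
    return path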
Example #6
def show_similar_columns(similar_columns):
    print("Found " + str(len(similar_columns)) + " groups of similar columns:")
    for idx, cluster in enumerate(similar_columns):
        # collect the columns of this cluster, then write them to a separate csv file
        cluster_series = []  # renamed: must not shadow the similar_columns argument
        print(str(len(cluster)) + " similar columns:")
        print(cluster)
        for column_label in cluster:
            print("Column: " + column_label)
            file, col_id = column_label.split('_')
            fp = os.path.join(DATA_PATH, file)  # choose first sample file
            try:
                # read the table with anycsv (instead of pd.read_csv)
                csvr = anycsv.reader(filename=fp)
                # read the first line to determine the column count
                h = next(csvr)
                # set up one value list per column
                columns = [[] for _ in range(len(h))]
                for row in csvr:
                    for i, cell in enumerate(row):
                        columns[i].append(cell)
                column = columns[int(col_id)]
                print(column[:5])
                cluster_series.append(pd.Series(column))
            except Exception as e:
                print(e)
        df = pd.concat(cluster_series, axis=1).reset_index()
        df.to_csv(RESULTS_PATH + 'cluster' + str(idx) + '.csv')
Example #7
def find_similar_hashed_columns(hashbits=8, limit=None):
    '''
    Find similar columns using simhash
    '''
    hashes = {}
    hash_func = SimHash(hashbits)
    textual_columns_iterator = get_textual_columns(limit)
    for i, column_description in textual_columns_iterator:
        file = column_description['file']
        col_id = column_description['column']
        label = file + '_' + str(col_id)
        # print "Column: " + label
        fp = os.path.join(DATA_PATH, file)  # choose first sample file
        # get the table as a df
        # df = pd.read_csv(fp)
        try:
            csvr = anycsv.reader(filename=fp)
            # skip the first 3 lines to avoid description and header lines
            h = next(csvr)
            h = next(csvr)
            h = next(csvr)
            while len(h) <= 1:
                # possibly a description line
                h = next(csvr)
            # set up one value list per column
            columns = [[] for _ in range(len(h))]
            for row in csvr:
                for j, cell in enumerate(row):  # j avoids shadowing the outer index i
                    columns[j].append(cell)
            column = columns[col_id]
            hashed_column = hash_column(column, hash_func).hex()
            if hashed_column not in hashes:
                hashes[hashed_column] = []
            hashes[hashed_column].append(label)
        except Exception as e:
            print(e)

    # print hashes
    similar_columns = [bucket for bucket in hashes.values() if len(bucket) > 1]
    # print similar_columns
    return similar_columns
Example #8
def inspect_csv(csv, bench, bench_out):
    """Inspect a CSV file to figure out about the dialect, comment and header lines and the overall structure."""

    click.echo("Input CSV {}".format(csv))

    reader = anycsv.reader(csv)
    for i, row in enumerate(reader):
        pass  # consume the reader so digest and dialect are populated

    click.echo("{:-^80}".format(" Table Info "))
    click.echo("    input: {}".format(reader.csv))
    click.echo(" encoding: {}".format(reader.encoding))
    click.echo("      md5: {}".format(reader.digest))
    click.echo("  dialect:")
    for k, v in reader.dialect._asdict().items():
        click.echo("    {}: {}".format(k, v))

    if bench:
        click.echo("TIMING")
        click.echo(Timer.printStats())
    if bench_out:
        Timer.to_csv(bench_out)
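A sketch of how inspect_csv might be wired up as a click command; the decorators are assumptions inferred from the signature, using only click's standard @click.command, @click.argument and @click.option APIs:

import click

# hypothetical wiring for the inspect_csv(csv, bench, bench_out) shown above
@click.command()
@click.argument('csv')
@click.option('--bench', is_flag=True, help='Print timing statistics.')
@click.option('--bench-out', default=None, help='Write timings to this CSV file.')
def inspect_csv(csv, bench, bench_out):
    ...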
Example #9
    def __init__(self, filename=None, url=None, content=None, sample_size=20, skip_guess_encoding=False):

        self.table = anycsv.reader(filename=filename, url=url, content=content, skip_guess_encoding=skip_guess_encoding)

        # copy dialect information to the returning result object
        keys = ["encoding", "url", "filename", "delimiter", "quotechar", "columns"]
        for k, v in self.table.__dict__.items():
            if k in keys:
                if v:
                    setattr(self, k, v)

        self.sample = []
        for i, row in enumerate(self.table):
            # if len(row) != self.columns:
            #    raise ValueError("Row length of "+str(len(row))+" does not match column length of "+str(self.columns))
            if i >= sample_size:
                break
            self.sample.append(row)

        self.description = guess_description_lines(self.sample)

        # split the detected description lines off the sample once
        self.descriptionLines = self.sample[:self.description]
        self.sample = self.sample[self.description:]

        self.emptyColumns = detect_empty_columns(self.sample)

        # column count from the first row after the description lines
        self.columns = len(self.sample[0])

        self.header_line = guess_headers(self.sample, self.emptyColumns)

        if self.header_line:
            self.table.seek_line(self.description + 1)
        else:
            self.table.seek_line(self.description)
Example #10
        self.length = max(tokens.items(), key=operator.itemgetter(1))[0]


if __name__ == '__main__':
    col_stats = {'labels': defaultdict(int), 'lengths': defaultdict(int), 'columns': 0, 'tables': 0, 'errors': 0}

    classification = [['file', 'column', 'type', 'avg_tokens']]

    for root, subdirs, files in os.walk('tables'):
        for filename in files:
            if filename.endswith('.csv'):
                try:

                    path = os.path.join(root, filename)
                    csvr = anycsv.reader(filename=path)
                    # skip first 3 lines to avoid description and header lines
                    h = next(csvr)
                    h = next(csvr)
                    h = next(csvr)
                    while len(h) <= 1:
                        # possibly a description line
                        h = next(csvr)
                    # set up one value list per column
                    columns = [[] for _ in range(len(h))]
                    for row in csvr:
                        for i, cell in enumerate(row):
                            columns[i].append(cell)
                    for col_id, c in enumerate(columns):
                        col = Column(c)
                        col_stats['labels'][col.label] += 1
Example #11
import anycsv
import csv

#reader = anycsv.reader(filename="data.csv")
reader = anycsv.reader(
    url="https://dev.inpher.io/datasets/correlation/test1/bank-full-X.csv")

with open('testfile.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='|')
    writer.writerows(reader)
Example #12
import anycsv
import csv

# filename = 'path/to/file.csv'
# reader = anycsv.reader(filename=filename)

# url = 'http://file.csv'
# reader = anycsv.reader(url=url)

# content = 'a,b,c\n1,2,3\n4,5,6'
# reader = anycsv.reader(content=content)

reader = anycsv.reader(
    url="https://files.datapress.com/calderdale/dataset/domestic-consumption-monitor---monthly-meter-readings/2016-08-31T11:56:15/Domestic")

with open('testfile.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='|')
    writer.writerows(reader)
Example #13
import anycsv
import csv

reader = anycsv.reader(filename="data.csv")
# reader = anycsv.reader(url="https://dev.inpher.io/datasets/correlation/test1/bank-full-X.csv")

with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerows(reader)
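Examples 11 to 13 above exercise the three input modes of anycsv.reader; a minimal side-by-side sketch (the path and URL are placeholders):

import anycsv

# 1) local file
reader = anycsv.reader(filename='path/to/file.csv')
# 2) remote file
reader = anycsv.reader(url='http://example.org/file.csv')
# 3) in-memory content
reader = anycsv.reader(content='a,b,c\n1,2,3\n4,5,6')

for row in reader:
    print(row)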
Example #14
if __name__ == '__main__':
    col_stats = {
        'labels': defaultdict(int),
        'lengths': defaultdict(int),
        'columns': 0,
        'tables': 0,
        'errors': 0
    }

    classification = [['file', 'column', 'type', 'avg_tokens']]

    for root, subdirs, files in os.walk(PATH):
        for filename in files:
            if filename.endswith('.csv'):
                try:

                    path = os.path.join(root, filename)
                    csvr = anycsv.reader(filename=path)
                    # skip first 3 lines to avoid description and header lines
                    h = next(csvr)
                    h = next(csvr)
                    h = next(csvr)
                    while len(h) <= 1:
                        # possibly a description line
                        h = next(csvr)
                    # set up one value list per column
                    columns = [[] for _ in range(len(h))]
                    for row in csvr:
                        for i, cell in enumerate(row):
                            columns[i].append(cell)
                    for col_id, c in enumerate(columns):
                        col = Column(c)
                        col_stats['labels'][col.label] += 1
Example #15
    def from_table(self,
                   filename=None,
                   url=None,
                   content=None,
                   min_matches=0.6,
                   sample_size=300):
        if not filename and not url and not content:
            return None

        sample = []
        cols = []
        col_types = []
        num_cols = 0
        i = 0
        try:
            csvr = anycsv.reader(filename=filename, url=url, content=content)
        except NoDelimiterException:
            csvr = anycsv.reader(filename=filename,
                                 url=url,
                                 content=content,
                                 delimiter=',')

        for i, row in enumerate(csvr):
            if i <= sample_size:
                sample.append(row)
                num_cols = len(row)
            for k, c in enumerate(row):
                if len(cols) == 0:
                    cols = [[] for _ in range(num_cols)]
                    col_types = [defaultdict(int) for _ in range(num_cols)]
                if NUTS_PATTERN.match(c):
                    col_types[k]['NUTS'] += 1
                elif POSTAL_PATTERN.match(c):
                    col_types[k]['POSTAL'] += 1
                cols[k].append(c.strip())
        result = ['' for _ in range(num_cols)]
        disambiguation = [['' for _ in range(sample_size)]
                          for _ in range(num_cols)]
        for col in range(num_cols):
            # classify based on the dominant cell type (90% threshold)
            if 'NUTS' in col_types[col] and col_types[col]['NUTS'] >= i * 0.9:
                disamb, confidence, res_col, source = self.nuts_column(cols[col])
            elif 'POSTAL' in col_types[col] and col_types[col]['POSTAL'] >= i * 0.9:
                disamb, confidence, res_col = self.postalcodes_column(cols[col])
            else:
                disamb, confidence, res_col, source = self.string_column(cols[col])

            if confidence > min_matches:
                disambiguation[col] = disamb
                result[col] = res_col
        return {
            'disambiguation': disambiguation,
            'sample': sample,
            'cols': num_cols,
            'rows': i,
            'tagging': result
        }
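A hedged usage sketch for from_table; the enclosing class name (GeoTagger here) is an assumption, while the keyword arguments and the result keys come from the method itself:

# hypothetical class name; from_table is the method shown above
tagger = GeoTagger()
result = tagger.from_table(filename='regions.csv', min_matches=0.6)
if result:
    print(result['cols'], 'columns,', result['rows'], 'rows')
    print('per-column tagging:', result['tagging'])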