Example #1
0
def delete_table(psql_on_db, tablename):
    print "Deleting table..."
    result = envoy.run("""{psql} -c 'DROP TABLE "{tablename}"'""".format(
        psql=psql_on_db, tablename=tablename))
    if result.status_code == 2:
        print_error_result(result, "Error while executing command")
        exit(1)
    elif result.status_code == 1:
        print "Table", tablename, "didn't exist"
    else:
        print "Table", tablename, "dropped successfully"
Example #2
0
def main(args):
    """ the body of the script
    """
    tmpdir = mkdtemp()

    if args.file.startswith(('http://', 'https://')):
        print "The file is in the net, downloading it..."
        file_basename = os.path.basename(
            urllib2.urlparse.urlsplit(args.file).path
        )

        result = envoy.run('wget "{}" -O {}'.format(args.file, file_basename))

        if result.status_code:
            print_error_result(
                result, "Error while downloading RDF data {}. Aborting".format(
                    args.file
                )
            )
            exit(1)

        filename = os.path.join(
            tmpdir, file_basename
        )
    else:
        print "The file is  local, moving it..."
        shutil.copy(args.file, tmpdir)
        filename = os.path.join(tmpdir, os.path.basename(args.file))

    print "handling file", filename

    filename_cropped, extension = os.path.splitext(filename)
    if extension == '.bz2':
        print "Got a bz2 file, need to convert it with gzip"
        gzip_filename = filename_cropped + '.gz'

        result = envoy.run('bunzip2 "{}" -c | gzip > "{}'.format(
            filename, gzip_filename
        ))

        if result.status_code:
            print_error_result(result, "Error while converting file, aborting")
            exit(2)

        filename = gzip_filename
        print "File converted successfully, now handling", filename

    print "Ingesting file in virtuoso"
    virtuoso = get_virtuoso()
    virtuoso.clear(args.graph)
    print "Ingestion completed", virtuoso.ingest(filename, graph=args.graph)
Example #3
0
def delete_table(psql_on_db, tablename):
    print "Deleting table..."
    result = envoy.run(
        """{psql} -c 'DROP TABLE "{tablename}"'""".format(
            psql=psql_on_db, tablename=tablename
        )
    )
    if result.status_code == 2:
        print_error_result(result, "Error while executing command")
        exit(1)
    elif result.status_code == 1:
        print "Table", tablename, "didn't exist"
    else:
        print "Table", tablename, "dropped successfully"
Example #4
0
def main(args):
    """ the body of the script
    """
    tmpdir = mkdtemp()

    if args.file.startswith(('http://', 'https://')):
        print "The file is in the net, downloading it..."
        file_basename = os.path.basename(
            urllib2.urlparse.urlsplit(args.file).path)

        result = envoy.run('wget "{}" -O {}'.format(args.file, file_basename))

        if result.status_code:
            print_error_result(
                result, "Error while downloading RDF data {}. Aborting".format(
                    args.file))
            exit(1)

        filename = os.path.join(tmpdir, file_basename)
    else:
        print "The file is  local, moving it..."
        shutil.copy(args.file, tmpdir)
        filename = os.path.join(tmpdir, os.path.basename(args.file))

    print "handling file", filename

    filename_cropped, extension = os.path.splitext(filename)
    if extension == '.bz2':
        print "Got a bz2 file, need to convert it with gzip"
        gzip_filename = filename_cropped + '.gz'

        result = envoy.run('bunzip2 "{}" -c | gzip > "{}'.format(
            filename, gzip_filename))

        if result.status_code:
            print_error_result(result, "Error while converting file, aborting")
            exit(2)

        filename = gzip_filename
        print "File converted successfully, now handling", filename

    print "Ingesting file in virtuoso"
    virtuoso = get_virtuoso()
    virtuoso.clear(args.graph)
    print "Ingestion completed", virtuoso.ingest(filename, graph=args.graph)
Example #5
0
def main(args):
    DATABASE = settings.TABULAR_DATABASE

    _, ext = os.path.splitext(args.file)
    ext = ext.strip('.').lower()

    print "Detected extension: " + ext

    is_url = args.file.startswith(('http://', 'https://'), )
    if args.inspect:
        if is_url:
            cmd = 'wget "{filename}" -O - | in2csv --no-inference -f {ext} ' \
                  '| csvstat'.format(filename=args.file, ext=ext)
        else:
            cmd = 'csvstat "{filename}"'.format(filename=args.file)

        result = envoy.run(cmd)
        print result.std_out
        exit(0)

    dbname = args.database
    dbport = DATABASE.get("PORT")
    dbhost = DATABASE.get('HOST')
    if not dbname:
        print 'Setting db to default:', DATABASE['NAME']
        dbname = DATABASE['NAME']

    psql_on_db = "psql -d {dbname} -U {dbuser} -p {dbport} -h {dbhost}" \
                 "".format(
                     dbname=dbname,
                     dbuser=DATABASE['USER'],
                     dbport=dbport or '5432',
                     dbhost=dbhost or 'localhost',
                 )

    connection_string = "postgresql://{dbuser}:{dbpass}@" \
                        "{dbhost}:{dbport}/{dbname}".format(
                            dbuser=DATABASE['USER'],
                            dbpass=DATABASE['PASSWORD'],
                            dbhost=dbhost or 'localhost',
                            dbport=dbport or '5432',
                            dbname=dbname
                        )

    print "starting ingestion"

    tablename = args.tablename
    print "table name:", tablename

    if args.erase:
        delete_table(psql_on_db, tablename)
        exit(0)

    delete_table(psql_on_db, tablename)

    delimiter_options = "-d '{}'".format(args.delimiter)
    if args.delimiter == '\\t':
        delimiter_options = '--tabs'
    # hack: quotechar is given always escaped (see tab-configuration.json)
    if args.quotechar.startswith('\\') and len(args.quotechar) > 1:
        args.quotechar = args.quotechar[1:]
    separator = iter(frozenset(('"', "'")) - frozenset(args.quotechar)).next()
    quotechar_options = "-q {0}{1}{0}".format(separator, args.quotechar)

    mime_encoding = args.encoding
    if not mime_encoding:
        result = envoy.run(
            'file --brief --mime-encoding "{}"'.format(args.file)
        )
        mime_encoding = result.std_out.strip()
    print "Using MIME encoding:", mime_encoding

    if is_url:
        cmd = """wget "{filename}" -O - |
        in2csv -f {ext} {delimiter_opts} {quotechar_opts} |
        csvsql --no-constraints {delimiter_opts} {quotechar_opts} --insert --db
        "{connection_string}" --table "{tablename}" --maxfieldsize 10485760
        -e "{mime}" --no-inference -""".format(
            filename=args.file,
            ext=ext,
            connection_string=connection_string,
            tablename=tablename,
            delimiter_opts=delimiter_options,
            quotechar_opts=quotechar_options,
            mime=mime_encoding,
        )

        result = envoy.run(cmd)
        if result.status_code:
            print_error_result(
                result, "Error while ingesting data. Aborting."
            )
            exit(1)
    else:
        # Note: the --maxfieldsize option is set to 10MB

        if ext in ("xls", "xlsx"):
            cmd = r"""
                in2csv -f {ext} {delimiter_opts} {quotechar_opts}
                "{filename}" | csvsql --no-constraints {delimiter_opts}
                {quotechar_opts} --insert -e "{mime}" --db
                "{connection_string}" --table "{tablename}"
                --maxfieldsize 10485760 --no-inference -
            """
        else:
            # Note: the -z option is set to 10MB
            cmd = r"""
                csvsql --no-constraints {delimiter_opts} {quotechar_opts}
                       --insert -e "{mime}" --db "{connection_string}"
                       --table "{tablename}" "{filename}"
                       --maxfieldsize 10485760 --no-inference
                """

        cmd = cmd.format(
            mime=mime_encoding,
            ext=ext,
            filename=args.file,
            connection_string=connection_string,
            tablename=tablename,
            delimiter_opts=delimiter_options,
            quotechar_opts=quotechar_options,
        )

        result = envoy.run(cmd)
        if result.status_code:
            print_error_result(result, "Error while ingesting data" + cmd)
            exit(1)

    print "Ingestion completed."
Example #6
0
def main(args):
    DATABASE = settings.TABULAR_DATABASE

    _, ext = os.path.splitext(args.file)
    ext = ext.strip('.').lower()

    print "Detected extension: " + ext

    is_url = args.file.startswith(('http://', 'https://'), )
    if args.inspect:
        if is_url:
            cmd = 'wget "{filename}" -O - | in2csv --no-inference -f {ext} ' \
                  '| csvstat'.format(filename=args.file, ext=ext)
        else:
            cmd = 'csvstat "{filename}"'.format(filename=args.file)

        result = envoy.run(cmd)
        print result.std_out
        exit(0)

    dbname = args.database
    dbport = DATABASE.get("PORT")
    dbhost = DATABASE.get('HOST')
    if not dbname:
        print 'Setting db to default:', DATABASE['NAME']
        dbname = DATABASE['NAME']

    psql_on_db = "psql -d {dbname} -U {dbuser} -p {dbport} -h {dbhost}" \
                 "".format(
                     dbname=dbname,
                     dbuser=DATABASE['USER'],
                     dbport=dbport or '5432',
                     dbhost=dbhost or 'localhost',
                 )

    connection_string = "postgresql://{dbuser}:{dbpass}@" \
                        "{dbhost}:{dbport}/{dbname}".format(
                            dbuser=DATABASE['USER'],
                            dbpass=DATABASE['PASSWORD'],
                            dbhost=dbhost or 'localhost',
                            dbport=dbport or '5432',
                            dbname=dbname
                        )

    print "starting ingestion"

    tablename = args.tablename
    print "table name:", tablename

    if args.erase:
        delete_table(psql_on_db, tablename)
        exit(0)

    delete_table(psql_on_db, tablename)

    delimiter_options = "-d '{}'".format(args.delimiter)
    if args.delimiter == '\\t':
        delimiter_options = '--tabs'
    # hack: quotechar is given always escaped (see tab-configuration.json)
    if args.quotechar.startswith('\\') and len(args.quotechar) > 1:
        args.quotechar = args.quotechar[1:]
    separator = iter(frozenset(('"', "'")) - frozenset(args.quotechar)).next()
    quotechar_options = "-q {0}{1}{0}".format(separator, args.quotechar)

    mime_encoding = args.encoding
    if not mime_encoding:
        result = envoy.run('file --brief --mime-encoding "{}"'.format(
            args.file))
        mime_encoding = result.std_out.strip()
    print "Using MIME encoding:", mime_encoding

    if is_url:
        cmd = """wget "{filename}" -O - |
        in2csv -f {ext} {delimiter_opts} {quotechar_opts} |
        csvsql --no-constraints {delimiter_opts} {quotechar_opts} --insert --db
        "{connection_string}" --table "{tablename}" --maxfieldsize 10485760
        -e "{mime}" --no-inference -""".format(
            filename=args.file,
            ext=ext,
            connection_string=connection_string,
            tablename=tablename,
            delimiter_opts=delimiter_options,
            quotechar_opts=quotechar_options,
            mime=mime_encoding,
        )

        result = envoy.run(cmd)
        if result.status_code:
            print_error_result(result, "Error while ingesting data. Aborting.")
            exit(1)
    else:
        # Note: the --maxfieldsize option is set to 10MB

        if ext in ("xls", "xlsx"):
            cmd = r"""
                in2csv -f {ext} {delimiter_opts} {quotechar_opts}
                "{filename}" | csvsql --no-constraints {delimiter_opts}
                {quotechar_opts} --insert -e "{mime}" --db
                "{connection_string}" --table "{tablename}"
                --maxfieldsize 10485760 --no-inference -
            """
        else:
            # Note: the -z option is set to 10MB
            cmd = r"""
                csvsql --no-constraints {delimiter_opts} {quotechar_opts}
                       --insert -e "{mime}" --db "{connection_string}"
                       --table "{tablename}" "{filename}"
                       --maxfieldsize 10485760 --no-inference
                """

        cmd = cmd.format(
            mime=mime_encoding,
            ext=ext,
            filename=args.file,
            connection_string=connection_string,
            tablename=tablename,
            delimiter_opts=delimiter_options,
            quotechar_opts=quotechar_options,
        )

        result = envoy.run(cmd)
        if result.status_code:
            print_error_result(result, "Error while ingesting data" + cmd)
            exit(1)

    print "Ingestion completed."