コード例 #1
0
def test_comp_df():
    """Check that CompressedDF columns decompress back to the source data.

    One frame is compressed with ``parallel=True`` and one with
    ``sliding_window=10``; a column from each is decompressed and compared
    element-wise against the original DataFrame column.
    """
    frame = pd.DataFrame({
        "a": [1, 1, 2, 2, 1, 3, 4, 4],
        "b": ["A", "A", "B", "B", "A", "C,D", "D C", "D C"],
    })
    parallel_df = lzhw.CompressedDF(frame, parallel=True)
    windowed_df = lzhw.CompressedDF(frame, sliding_window=10)

    # Column index 1 is "b" in the parallel-compressed frame.
    restored_b = parallel_df.compressed[1].decompress()
    assert all(restored_b == frame["b"])

    # Column index 0 is "a" in the frame built with a sliding window of 10.
    restored_a = windowed_df.compressed[0].decompress()
    assert all(restored_a == frame["a"])
コード例 #2
0
ファイル: lzhw_cli.py プロジェクト: MNoorFawi/lzhw
def main():
    """Command-line entry point: compress or decompress a tabular file.

    Parses the command-line options and then, depending on ``--decompress``:

    * decompresses ``--input`` with ``lzhw.decompress_df_from_file`` and
      writes the result to ``--output`` as Excel, CSV or plain text, chosen
      by whether the output name contains ``"xls"`` / ``"csv"``; or
    * reads ``--input`` (Excel via a temporary VBScript conversion to CSV,
      CSV via pandas, anything else as raw text), compresses it with
      ``lzhw.CompressedDF`` and saves it to ``--output``.

    Progress messages and the elapsed time (in minutes) are printed to
    standard output.
    """
    ## This script and the solution to convert xlsx into csv was thanks to the answer found here:
    ## https://stackoverflow.com/questions/28766133/faster-way-to-read-excel-files-to-pandas-dataframe
    ## and here: https://stackoverflow.com/questions/1858195/convert-xls-to-csv-on-command-line
    vbscript = """if WScript.Arguments.Count < 3 Then
        WScript.Echo "Please specify the source and the destination files. Usage: ExcelToCsv <xls/xlsx source file> <csv destination file> <worksheet number (starts at 1)>"
        Wscript.Quit
    End If

    csv_format = 6

    Set objFSO = CreateObject("Scripting.FileSystemObject")

    src_file = objFSO.GetAbsolutePathName(Wscript.Arguments.Item(0))
    dest_file = objFSO.GetAbsolutePathName(WScript.Arguments.Item(1))
    worksheet_number = CInt(WScript.Arguments.Item(2))

    Dim oExcel
    Set oExcel = CreateObject("Excel.Application")

    Dim oBook
    Set oBook = oExcel.Workbooks.Open(src_file)
    oBook.Worksheets(worksheet_number).Activate

    oBook.SaveAs dest_file, csv_format

    oBook.Close False
    oExcel.Quit
    """

    def is_number(s):
        """Return True when ``s`` can be converted with ``float()``."""
        try:
            float(s)
            return True
        except ValueError:
            return False

    def csv_reader(file, cols, col_arg, nh_arg):
        """Read ``file`` with pandas, optionally restricted to some columns.

        ``cols`` is a comma-separated string of column names or 1-based
        indices; it is only consulted when ``col_arg`` is truthy. ``nh_arg``
        truthy means the file has no header row. Column labels of the
        returned DataFrame are converted to strings.
        """
        h = None if nh_arg else 0
        if col_arg:
            cols_used = cols.split(",")
            if is_number(cols_used[0]):
                # User-facing indices are 1-based; pandas expects 0-based.
                cols_used = [int(i) - 1 for i in cols_used]
        else:
            cols_used = None

        data = pd.read_csv(file, header=h, usecols=cols_used)
        data.columns = list(map(str, data.columns))
        return data

    parser = argparse.ArgumentParser(
        description=
        "LZHW is a tabular data compression tool. It is used to compress excel, csv and any flat file. Version: 0.0.10"
    )
    parser.add_argument("-d",
                        "--decompress",
                        help="decompress input into output",
                        action="store_true",
                        default=False)
    parser.add_argument("-f",
                        "--input",
                        help="input file to be (de)compressed",
                        type=str,
                        required=True)
    parser.add_argument("-o",
                        "--output",
                        help="output where to save result",
                        type=str,
                        required=True)
    parser.add_argument(
        "-c",
        "--columns",
        nargs="+",
        help=
        "select specific columns by names or indices (1-based) to compress or decompress",
        type=str,
        required=False)
    parser.add_argument("-r",
                        "--rows",
                        help="select specific rows to decompress (1-based)",
                        type=str,
                        required=False)
    parser.add_argument(
        "-nh",
        "--no-header",
        help="skip header / data to be compressed has no header",
        action="store_true",
        default=False)
    parser.add_argument("-p",
                        "--parallel",
                        help="compress or decompress in parallel",
                        action="store_true",
                        default=False)
    parser.add_argument(
        "-j",
        "--jobs",
        help="Number of CPUs to use if parallel (default all but 2)",
        type=str,
        required=False,
        default="-3")
    args = vars(parser.parse_args())

    file = args["input"]
    output = args["output"]
    para = args["parallel"]
    n_jobs = args["jobs"]

    # Only the first --columns token is used; it is expected to be a
    # comma-separated list.
    if args["columns"]:
        cols = args["columns"][0]
    else:
        cols = "all"

    if args["rows"]:
        n_rows = int(args["rows"])
    else:
        n_rows = 0

    if args["decompress"]:
        start = time()
        if cols != "all":
            cols = cols.split(",")
            if is_number(cols[0]):
                # Convert 1-based user indices to 0-based positions.
                cols = [int(i) - 1 for i in cols]

        if para:
            decompressed = lzhw.decompress_df_from_file(file,
                                                        cols,
                                                        n_rows,
                                                        parallel=para,
                                                        n_jobs=int(n_jobs))
        else:
            decompressed = lzhw.decompress_df_from_file(file, cols, n_rows)

        decompressed.fillna("", inplace=True)
        # Blank out cells that are exactly the string "nan". A whole-value
        # match is used on purpose: a regex/substring match would also strip
        # "nan" out of legitimate values such as "banana".
        decompressed = decompressed.replace("nan", "")

        wants_excel = "xls" in output
        wants_csv = "csv" in output
        if wants_excel:
            options = {}
            options["strings_to_formulas"] = False
            options["strings_to_urls"] = False
            # NOTE(review): `options=` and `writer.save()` are not accepted by
            # recent pandas releases -- confirm against the pinned version.
            writer = pd.ExcelWriter(output,
                                    engine="xlsxwriter",
                                    options=options)
            decompressed.to_excel(writer, output.split(".xls")[0], index=False)
            writer.save()
        if wants_csv:
            decompressed.to_csv(output, index=False)
        elif not wants_excel:
            # Plain-text fallback only when no other format was written;
            # otherwise it would overwrite the Excel file just produced.
            with open(output, "w") as o:
                decompressed.to_string(o, index=False)
        print("Finalizing Decompression ...")
        print(f"Creating {output} file ...")
        print("time taken: ", (time() - start) / 60, " minutes")
        print("Decompressed Successfully")

    else:
        start = time()
        if "xls" in file:
            print(
                "Reading files, Can take 1 minute or something ...",
                "\nRunning CScript.exe to convert xls file to csv for better performance",
                "\n")
            # Write the helper script, making sure the handle is closed
            # before cscript.exe tries to read it.
            with open("excel_to_csv.vbs", "w") as f:
                f.write(vbscript)
            csv_file = file.split(".xls")[0] + "1" + ".csv"
            call(["cscript.exe", "excel_to_csv.vbs", file, csv_file, "1"])
            os.remove("excel_to_csv.vbs")

            data = csv_reader(csv_file, cols, args["columns"],
                              args["no_header"])

            # The intermediate CSV is only a conversion artifact.
            os.remove(csv_file)

        elif "csv" in file:
            print("Reading files ...")
            data = csv_reader(file, cols, args["columns"], args["no_header"])

        else:
            with open(file, "r") as i:
                data = i.read()

        if para:
            comp_df = lzhw.CompressedDF(data,
                                        parallel=para,
                                        n_jobs=int(n_jobs))
        else:
            comp_df = lzhw.CompressedDF(data)

        print("Finalizing Compression ...")
        comp_df.save_to_file(output)
        print(f"Creating {output} file ...")
        print("time taken: ", (time() - start) / 60, " minutes")
        print("Compressed Successfully")
コード例 #3
0
parser.add_argument("-o", "--output", help="output where to save result",
                    type=str, required=True)
args = vars(parser.parse_args())

file = args["input"]
output = args["output"]

if args["decompress"]:
    # Decompress the input and write it in the format suggested by the
    # output name ("xls" -> Excel, "csv" -> CSV, otherwise plain text).
    decompressed = lzhw.decompress_df_from_file(file)
    wants_excel = "xls" in output
    wants_csv = "csv" in output
    if wants_excel:
        decompressed.to_excel(output, index=False)
    if wants_csv:
        decompressed.to_csv(output, index=False)
    elif not wants_excel:
        # Plain-text fallback only when nothing else was written; otherwise
        # it would overwrite the Excel file produced above.
        with open(output, "w") as o:
            decompressed.to_string(o, index=False)
    print("decompressed successfully")

else:
    # Load the input according to its name, then compress and save it.
    is_excel = "xls" in file
    is_csv = "csv" in file
    if is_excel:
        data = pd.read_excel(file)
    if is_csv:
        data = pd.read_csv(file)
    elif not is_excel:
        # Raw-text fallback only for inputs that are neither Excel nor CSV;
        # otherwise it would discard the DataFrame read from the workbook.
        with open(file, "r") as i:
            data = i.read()

    comp_df = lzhw.CompressedDF(data)
    comp_df.save_to_file(output)
    print("compressed successfully")
コード例 #4
0
ファイル: test_lzhw.py プロジェクト: mma1979/lzhw
def test_comp_df():
    """Check that column "b" decompresses to its values rendered as strings."""
    frame = pd.DataFrame({
        "a": [1, 1, 2, 2, 1, 3, 4, 4],
        "b": ["A", "A", "B", "B", "A", "C,D", "D C", "D C"],
    })
    compressed_frame = lzhw.CompressedDF(frame)

    # Index 1 is the compressed form of column "b".
    restored = compressed_frame.compressed[1].decompress()
    expected = [str(value) for value in frame.b]
    assert restored == expected
コード例 #5
0
import flask
import lzhw
from time import time
import pandas as pd
from flask import jsonify, request

# Flask application, run with debug mode enabled.
app = flask.Flask(__name__)
app.config["DEBUG"] = True

# Load the sales records once at import time and compress every column.
df = pd.read_csv("1500000 Sales Records.csv")
comp = lzhw.CompressedDF(df)

# Replace each column's compressed sequence with the binary-digit string of
# every element (the leading "0b" of bin() is sliced off).
for i, _ in enumerate(df.columns):
    column = comp.compressed[i]
    column.compressed = [bin(code)[2:] for code in column.compressed]


@app.route('/full', methods=['GET'])
def get_full():
    """Return one uncompressed DataFrame column as JSON.

    The column position comes from the ``col`` query parameter. The elapsed
    time in seconds is printed whether or not the lookup succeeds.
    """
    column_index = int(request.args["col"])
    started_at = time()
    try:
        payload = df.iloc[:, column_index].to_json()
        return payload
    finally:
        # Runs on both the success and the error path.
        print(time() - started_at)


@app.route('/compressed', methods=['GET'])
def get_compressed():
    col = int(request.args["col"])
    try: