Python rdoの例

プログラミング言語: Python

名前空間/パッケージ名: _rdo

メソッド/関数: rdo

hotexamples.comのコード掲載数: 6

Python rdo - 6件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonの_rdo.rdoの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: format_raw_data.py プロジェクト: jdmmiranda307/dataviva-scripts

def main(file_path, trade_flow, year, eci_file_path, pci_file_path,
         output_path, prev_path, prev5_path):
    start = time.time()
    step = 0

    depths = {"bra": [1, 3, 5, 7, 8, 9], "hs": [2, 6], "wld": [2, 5]}

    step += 1
    print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step)
    secex_df = to_df(file_path, False)
    secex_df = secex_df.head(1000)
    sys.exit()

    step += 1
    print '''\nSTEP {0}: \nAggregate'''.format(step)
    ybpw = aggregate(secex_df)

    step += 1
    print '''\nSTEP {0}: \nShard'''.format(step)
    [yb, ybp, ybw, yp, ypw, yw] = shard(ybpw, depths)

    if trade_flow == "export":
        step += 1
        print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step)
        [yp, yw] = pci_wld_eci(eci_file_path, pci_file_path, yp, yw)

        step += 1
        print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        yb = domestic_eci(yp, yb, ybp, depths)

    step += 1
    print '''\nSTEP {0}: \nCalculate diversity'''.format(step)
    yb = calc_diversity(ybp, yb, "bra_id", "hs_id", depths)
    yb = calc_diversity(ybw, yb, "bra_id", "wld_id", depths)
    yp = calc_diversity(ybp, yp, "hs_id", "bra_id", depths)
    yp = calc_diversity(ypw, yp, "hs_id", "wld_id", depths)
    yw = calc_diversity(ybw, yw, "wld_id", "bra_id", depths)
    yw = calc_diversity(ypw, yw, "wld_id", "hs_id", depths)

    if trade_flow == "export":
        step += 1
        print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step)
        yp = brazil_rca(yp, year)

    if trade_flow == "export":
        step += 1
        print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(
            step)
        ybp = rdo(ybp, yp, year, depths)
    if trade_flow == "import":
        step += 1
        print '''\nSTEP {0}: \nCalculate RCD calculation'''.format(step)
        ybp = rcd(ybp, yp, year, depths)

    # print ybp.head(20)
    # sys.exit()

    tables = {
        "yb": yb,
        "yp": yp,
        "yw": yw,
        "ybp": ybp,
        "ybpw": ybpw,
        "ybw": ybw,
        "ypw": ypw
    }

    if prev_path:
        step += 1
        print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step)
        if prev5_path:
            step += 1
            print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step)
        for t_name, t in tables.items():
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name)
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] +
                                           list(t_prev.index.names)[:-1])

            t = calc_growth(t, t_prev)

            if prev5_path:
                prev_file = os.path.join(prev5_path,
                                         "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] +
                                               list(t_prev.index.names)[:-1])

                t = calc_growth(t, t_prev, 5)

    print "computing column lengths"
    for table_name, table_data in tables.items():
        tables[table_name] = add_column_length(table_name, table_data)

    print '''\nFINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(
            os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

    total_run_time = (time.time() - start) / 60
    print
    print
    print "Total runtime: {0} minutes".format(int(total_run_time))
    print
    print

コード例 #2

ファイルを表示

def main(export_file_path, import_file_path, year, eci_file_path,
         pci_file_path, ypw_file_path, output_path, prev_path, prev5_path):
    output_path = os.path.join(output_path, str(year))
    start = time.time()
    step = 0

    depths = {"bra": [1, 3, 5, 7, 9], "hs": [2, 6], "wld": [2, 5]}

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    d = pd.HDFStore(os.path.join(output_path, 'secex.h5'))
    # if "ymb" in d:
    if "ymbp" in d:
        tables = {}
        tables["ymb"] = d["ymb"]
        tables["ymp"] = d["ymp"]
        tables["ymw"] = d["ymw"]
        tables["ymbp"] = d["ymbp"]
        tables["ymbw"] = d["ymbw"]
        tables["ympw"] = d["ympw"]
        tables["ymbpw"] = d["ymbpw"]
    else:
        step += 1
        print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step)
        secex_exports = to_df(export_file_path, False)
        secex_imports = to_df(import_file_path, False)

        step += 1
        print '''\nSTEP {0}: \nMerge imports and exports'''.format(step)
        secex_df = merge(secex_exports, secex_imports)

        step += 1
        print '''\nSTEP {0}: \nAggregate'''.format(step)
        ymbpw = aggregate(secex_df)

        step += 1
        print '''\nSTEP {0}: \nShard'''.format(step)
        [ymb, ymbp, ymbw, ymp, ympw, ymw] = shard(ymbpw)

        step += 1
        print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step)
        [ymp, ymw] = pci_wld_eci(eci_file_path, pci_file_path, ymp, ymw, year)

        step += 1
        print '''\nSTEP {0}: \nCalculate diversity'''.format(step)
        ymb = calc_diversity(ymbp, ymb, "bra_id", "hs_id")
        ymb = calc_diversity(ymbw, ymb, "bra_id", "wld_id")
        ymp = calc_diversity(ymbp, ymp, "hs_id", "bra_id")
        ymp = calc_diversity(ympw, ymp, "hs_id", "wld_id")
        ymw = calc_diversity(ymbw, ymw, "wld_id", "bra_id")
        ymw = calc_diversity(ympw, ymw, "wld_id", "hs_id")

        step += 1
        print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"])

        step += 1
        print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"])

        step += 1
        print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step)
        ymp = brazil_rca(ymp, ypw_file_path, year)

        step += 1
        print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(
            step)
        ymbp = rdo(ymbp, ymp, year, depths["bra"], ypw_file_path)

        tables = {
            "ymb": ymb,
            "ymp": ymp,
            "ymw": ymw,
            "ymbp": ymbp,
            "ymbpw": ymbpw,
            "ymbw": ymbw,
            "ympw": ympw
        }
        for tbln, tbl in tables.items():
            d[tbln] = tbl

    if prev_path:
        step += 1
        print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step)
        if prev5_path:
            step += 1
            print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step)
        for t_name, t in tables.items():
            print t_name
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name)
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] +
                                           list(t_prev.index.names)[:-1])

            t = calc_growth(t, t_prev)

            if prev5_path:
                prev_file = os.path.join(prev5_path,
                                         "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] +
                                               list(t_prev.index.names)[:-1])

                t = calc_growth(t, t_prev, 5)

    print "computing column lengths"
    for table_name, table_data in tables.items():
        tables[table_name] = add_column_length(table_name, table_data)

    print '''\nFINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(
            os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

    total_run_time = (time.time() - start) / 60
    print
    print
    print "Total runtime: {0} minutes".format(int(total_run_time))
    print
    print

コード例 #3

ファイルを表示

ファイル: format_raw_data.py プロジェクト: DataViva/dataviva-scripts

def main(file_path, year, output_path, prev_path, prev5_path, requireds_only):

    print; print "~~~**** YEAR: {0} ****~~~".format(year); print;
    start = time.time()
    step = 0
    # regions state, meso, micro, planning region, munic
    depths = {
        "bra": [1, 3, 5, 7, 8, 9],
        "cnae": [1, 3, 6],
        "cbo": [1, 4],
        "demo": [1, 4]
    }

    if file_path:
        if not os.path.exists(output_path): os.makedirs(output_path)
        d = pd.HDFStore(os.path.join(output_path, 'rais_df_raw.h5'))
        if "rais_df" in d:
            rais_df = d['rais_df']
        else:
            step+=1; print; print '''STEP {0}: \nImport file to pandas dataframe'''.format(step)
            rais_df = to_df(file_path, False)
            try:
                d['rais_df'] = rais_df
                # d.close()
            except OverflowError:
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))
        # rais_df = to_df(file_path, False)

        if "yb" in d and not requireds_only:
            tables = {"yb":d["yb"], "yo":d["yo"], "yi":d["yi"], "ybi":d["ybi"], "ybo":d["ybo"], "yio":d["yio"], "ybio":d["ybio"]}
        else:
            step+=1; print; print '''STEP {0}: \nAggregate'''.format(step)
            tables = aggregate(rais_df, depths)

            step+=1; print; print 'STEP {0}: \nImportance'.format(step)
            tables["yio"] = importance(tables["ybio"], tables["ybi"], tables["yio"], tables["yo"], year, depths)

            try:
                d["yb"] = tables["yb"]; d["yo"] =  tables["yo"]; d["yi"] =  tables["yi"]; d["ybi"] = tables["ybi"]; d["ybo"] = tables["ybo"]; d["yio"] = tables["yio"]; d["ybio"] = tables["ybio"]
                d.close()
            except OverflowError:
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))

        step+=1; print; print 'STEP {0}: \nRequired'.format(step)
        [tables["ybi"], tables["ybio"]] = required(tables["ybio"], tables["ybi"], tables["yi"], year, depths, output_path)

        # print tables["ybi"].head()
        # sys.exit()

        step+=1; print; print 'STEP {0}: \nDiversity'.format(step)
        tables["yb"] = calc_diversity(tables["ybi"], tables["yb"], "bra_id", "cnae_id", year, depths)
        tables["yb"] = calc_diversity(tables["ybo"], tables["yb"], "bra_id", "cbo_id", year, depths)
        tables["yi"] = calc_diversity(tables["ybi"], tables["yi"], "cnae_id", "bra_id", year, depths)
        tables["yi"] = calc_diversity(tables["yio"], tables["yi"], "cnae_id", "cbo_id", year, depths)
        tables["yo"] = calc_diversity(tables["ybo"], tables["yo"], "cbo_id", "bra_id", year, depths)
        tables["yo"] = calc_diversity(tables["yio"], tables["yo"], "cbo_id", "cnae_id", year, depths)

        step+=1; print; print 'STEP {0}: \nCalculate RCA, diversity and opportunity gain aka RDO'.format(step)
        tables["ybi"] = rdo(tables["ybi"], tables["yi"], year, depths)

        for table_name, table_data in tables.items():
            table_data = add_column_length(table_name, table_data)

        print; print '''FINAL STEP: \nSave files to output path'''
        for t_name, t in tables.items():
            new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
            t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")

    if prev_path:
        print; print '''Calculating growth:'''
        for current_year_file_path in findFiles(output_path, '*.tsv.bz2'):
            if "growth" in current_year_file_path: continue
            current_year_file_name = os.path.basename(current_year_file_path)
            prev_year_file_path = os.path.join(prev_path, current_year_file_name)
            prev5_year_file_path = None
            if prev5_path:
                prev5_year_file_path = os.path.join(prev5_path, current_year_file_name)
            if not os.path.exists(prev_year_file_path):
                print "Unable to find", current_year_file_name, "for previous year."
                continue
            tbl_name, tbl_w_growth = calc_growth(year, current_year_file_path, prev_year_file_path, prev5_year_file_path)
            print tbl_name
            new_file_path = os.path.abspath(os.path.join(output_path, "{0}_growth.tsv.bz2".format(tbl_name)))
            tbl_w_growth.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")
            # os.remove(current_year_file_path)


    print("--- %s minutes ---" % str((time.time() - start)/60))

コード例 #4

ファイルを表示

ファイル: format_raw_data.py プロジェクト: DataViva/dataviva-scripts

def main(file_path, trade_flow, year, eci_file_path, pci_file_path, output_path, prev_path, prev5_path):
    start = time.time()
    step = 0
    
    depths = {
        "bra": [1, 3, 5, 7, 8, 9],
        "hs": [2, 6],
        "wld": [2, 5]
    }
    
    step += 1; print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step)
    secex_df = to_df(file_path, False)
    secex_df = secex_df.head(1000)
    sys.exit()

    step += 1; print '''\nSTEP {0}: \nAggregate'''.format(step)
    ybpw = aggregate(secex_df)

    step += 1; print '''\nSTEP {0}: \nShard'''.format(step)
    [yb, ybp, ybw, yp, ypw, yw] = shard(ybpw, depths)

    if trade_flow == "export":
        step += 1; print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step)
        [yp, yw] = pci_wld_eci(eci_file_path, pci_file_path, yp, yw)

        step += 1; print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        yb = domestic_eci(yp, yb, ybp, depths)

    step += 1; print '''\nSTEP {0}: \nCalculate diversity'''.format(step)
    yb = calc_diversity(ybp, yb, "bra_id", "hs_id", depths)
    yb = calc_diversity(ybw, yb, "bra_id", "wld_id", depths)
    yp = calc_diversity(ybp, yp, "hs_id", "bra_id", depths)
    yp = calc_diversity(ypw, yp, "hs_id", "wld_id", depths)
    yw = calc_diversity(ybw, yw, "wld_id", "bra_id", depths)
    yw = calc_diversity(ypw, yw, "wld_id", "hs_id", depths)

    if trade_flow == "export":
        step += 1; print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step)
        yp = brazil_rca(yp, year)
    
    if trade_flow == "export":
        step += 1; print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(step)
        ybp = rdo(ybp, yp, year, depths)
    if trade_flow == "import":
        step += 1; print '''\nSTEP {0}: \nCalculate RCD calculation'''.format(step)
        ybp = rcd(ybp, yp, year, depths)
    
    # print ybp.head(20)
    # sys.exit()
    
    tables = {"yb": yb, "yp": yp, "yw": yw, "ybp": ybp, "ybpw": ybpw, "ybw": ybw, "ypw": ypw}
    
    if prev_path:
        step += 1; print '''\nSTEP {0}: \nCalculate 1 year growth'''.format(step)
        if prev5_path:
            step += 1; print '''\nSTEP {0}: \nCalculate 5 year growth'''.format(step)
        for t_name, t in tables.items():
            prev_file = os.path.join(prev_path, "{0}.tsv.bz2".format(t_name))
            t_prev = to_df(prev_file, t_name)
            t_prev = t_prev.reset_index(level="year")
            t_prev["year"] = int(year)
            t_prev = t_prev.set_index("year", append=True)
            t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
            
            t = calc_growth(t, t_prev)
            
            if prev5_path:
                prev_file = os.path.join(prev5_path, "{0}.tsv.bz2".format(t_name))
                t_prev = to_df(prev_file, t_name)
                t_prev = t_prev.reset_index(level="year")
                t_prev["year"] = int(year)
                t_prev = t_prev.set_index("year", append=True)
                t_prev = t_prev.reorder_levels(["year"] + list(t_prev.index.names)[:-1])
                
                t = calc_growth(t, t_prev, 5)

    print "computing column lengths"
    for table_name, table_data in tables.items():
        tables[table_name] = add_column_length(table_name, table_data)

    print '''\nFINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)
    
    total_run_time = (time.time() - start) / 60
    print; print;
    print "Total runtime: {0} minutes".format(int(total_run_time))
    print; print;

コード例 #5

ファイルを表示

ファイル: format_raw_data.py プロジェクト: jdmmiranda307/dataviva-scripts

def main(file_path, year, output_path, prev_path, prev5_path, requireds_only):

    print
    print "~~~**** YEAR: {0} ****~~~".format(year)
    print
    start = time.time()
    step = 0
    # regions state, meso, micro, planning region, munic
    depths = {
        "bra": [1, 3, 5, 7, 8, 9],
        "cnae": [1, 3, 6],
        "cbo": [1, 4],
        "demo": [1, 4]
    }

    if file_path:
        if not os.path.exists(output_path): os.makedirs(output_path)
        d = pd.HDFStore(os.path.join(output_path, 'rais_df_raw.h5'))
        if "rais_df" in d:
            rais_df = d['rais_df']
        else:
            step += 1
            print
            print '''STEP {0}: \nImport file to pandas dataframe'''.format(
                step)
            rais_df = to_df(file_path, False)
            try:
                d['rais_df'] = rais_df
                # d.close()
            except OverflowError:
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))
        # rais_df = to_df(file_path, False)

        if "yb" in d and not requireds_only:
            tables = {
                "yb": d["yb"],
                "yo": d["yo"],
                "yi": d["yi"],
                "ybi": d["ybi"],
                "ybo": d["ybo"],
                "yio": d["yio"],
                "ybio": d["ybio"]
            }
        else:
            step += 1
            print
            print '''STEP {0}: \nAggregate'''.format(step)
            tables = aggregate(rais_df, depths)

            step += 1
            print
            print 'STEP {0}: \nImportance'.format(step)
            tables["yio"] = importance(tables["ybio"], tables["ybi"],
                                       tables["yio"], tables["yo"], year,
                                       depths)

            try:
                d["yb"] = tables["yb"]
                d["yo"] = tables["yo"]
                d["yi"] = tables["yi"]
                d["ybi"] = tables["ybi"]
                d["ybo"] = tables["ybo"]
                d["yio"] = tables["yio"]
                d["ybio"] = tables["ybio"]
                d.close()
            except OverflowError:
                print "WARNING: Unable to save dataframe, Overflow Error."
                d.close()
                os.remove(os.path.join(output_path, 'rais_df_raw.h5'))

        step += 1
        print
        print 'STEP {0}: \nRequired'.format(step)
        [tables["ybi"],
         tables["ybio"]] = required(tables["ybio"], tables["ybi"],
                                    tables["yi"], year, depths, output_path)

        # print tables["ybi"].head()
        # sys.exit()

        step += 1
        print
        print 'STEP {0}: \nDiversity'.format(step)
        tables["yb"] = calc_diversity(tables["ybi"], tables["yb"], "bra_id",
                                      "cnae_id", year, depths)
        tables["yb"] = calc_diversity(tables["ybo"], tables["yb"], "bra_id",
                                      "cbo_id", year, depths)
        tables["yi"] = calc_diversity(tables["ybi"], tables["yi"], "cnae_id",
                                      "bra_id", year, depths)
        tables["yi"] = calc_diversity(tables["yio"], tables["yi"], "cnae_id",
                                      "cbo_id", year, depths)
        tables["yo"] = calc_diversity(tables["ybo"], tables["yo"], "cbo_id",
                                      "bra_id", year, depths)
        tables["yo"] = calc_diversity(tables["yio"], tables["yo"], "cbo_id",
                                      "cnae_id", year, depths)

        step += 1
        print
        print 'STEP {0}: \nCalculate RCA, diversity and opportunity gain aka RDO'.format(
            step)
        tables["ybi"] = rdo(tables["ybi"], tables["yi"], year, depths)

        for table_name, table_data in tables.items():
            table_data = add_column_length(table_name, table_data)

        print
        print '''FINAL STEP: \nSave files to output path'''
        for t_name, t in tables.items():
            new_file_path = os.path.abspath(
                os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
            t.to_csv(bz2.BZ2File(new_file_path, 'wb'),
                     sep="\t",
                     index=True,
                     float_format="%.3f")

    if prev_path:
        print
        print '''Calculating growth:'''
        for current_year_file_path in findFiles(output_path, '*.tsv.bz2'):
            if "growth" in current_year_file_path: continue
            current_year_file_name = os.path.basename(current_year_file_path)
            prev_year_file_path = os.path.join(prev_path,
                                               current_year_file_name)
            prev5_year_file_path = None
            if prev5_path:
                prev5_year_file_path = os.path.join(prev5_path,
                                                    current_year_file_name)
            if not os.path.exists(prev_year_file_path):
                print "Unable to find", current_year_file_name, "for previous year."
                continue
            tbl_name, tbl_w_growth = calc_growth(year, current_year_file_path,
                                                 prev_year_file_path,
                                                 prev5_year_file_path)
            print tbl_name
            new_file_path = os.path.abspath(
                os.path.join(output_path,
                             "{0}_growth.tsv.bz2".format(tbl_name)))
            tbl_w_growth.to_csv(bz2.BZ2File(new_file_path, 'wb'),
                                sep="\t",
                                index=True,
                                float_format="%.3f")
            # os.remove(current_year_file_path)

    print("--- %s minutes ---" % str((time.time() - start) / 60))

コード例 #6

ファイルを表示

ファイル: format_raw_data.py プロジェクト: diogolundberg/dataviva-scripts

def main(export_file_path, import_file_path, year, eci_file_path, pci_file_path, ypw_file_path, output_path):
    start = time.time()
    step = 0
    
    depths = {
        "bra": [1, 3, 5, 7, 9],
        "hs": [2, 6],
        "wld": [2, 5]
    }
    
    if not os.path.exists(output_path): os.makedirs(output_path)
    d = pd.HDFStore(os.path.join(output_path, 'secex.h5'))
    # if "ymb" in d:
    if "ymbp" in d:
        tables = {}
        tables["ymb"] = d["ymb"]; tables["ymp"] = d["ymp"]; tables["ymw"] = d["ymw"]; tables["ymbp"] = d["ymbp"]; tables["ymbw"] = d["ymbw"]; tables["ympw"] = d["ympw"]; tables["ymbpw"] = d["ymbpw"]
    else:
        step += 1; print '''\nSTEP {0}: \nImport file to pandas dataframe'''.format(step)
        secex_exports = to_df(export_file_path, False)
        secex_imports = to_df(import_file_path, False)
        # secex_exports = secex_exports.head(1000)
        # secex_imports = secex_imports.head(1000)

        step += 1; print '''\nSTEP {0}: \nMerge imports and exports'''.format(step)
        secex_df = merge(secex_exports, secex_imports)

        step += 1; print '''\nSTEP {0}: \nAggregate'''.format(step)
        ymbpw = aggregate(secex_df)

        step += 1; print '''\nSTEP {0}: \nShard'''.format(step)
        [ymb, ymbp, ymbw, ymp, ympw, ymw] = shard(ymbpw)

        step += 1; print '''\nSTEP {0}: \nCalculate PCI & ECI'''.format(step)
        [ymp, ymw] = pci_wld_eci(eci_file_path, pci_file_path, ymp, ymw, year)

        step += 1; print '''\nSTEP {0}: \nCalculate diversity'''.format(step)
        ymb = calc_diversity(ymbp, ymb, "bra_id", "hs_id")
        ymb = calc_diversity(ymbw, ymb, "bra_id", "wld_id")
        ymp = calc_diversity(ymbp, ymp, "hs_id", "bra_id")
        ymp = calc_diversity(ympw, ymp, "hs_id", "wld_id")
        ymw = calc_diversity(ymbw, ymw, "wld_id", "bra_id")
        ymw = calc_diversity(ympw, ymw, "wld_id", "hs_id")
        
        step += 1; print '''\nSTEP {0}: \nCalculate domestic ECI'''.format(step)
        ymb = domestic_eci(ymp, ymb, ymbp, depths["bra"])

        step += 1; print '''\nSTEP {0}: \nCalculate Brazilian RCA'''.format(step)
        ymp = brazil_rca(ymp, ypw_file_path, year)
    
        step += 1; print '''\nSTEP {0}: \nCalculate RCA, diversity and opp_gain aka RDO'''.format(step)
        ymbp = rdo(ymbp, ymp, year, depths["bra"], ypw_file_path)
        
        tables = {"ymb": ymb, "ymp": ymp, "ymw": ymw, "ymbp": ymbp, "ymbpw": ymbpw, "ymbw": ymbw, "ympw": ympw}
        for tbln, tbl in tables.items():
            d[tbln] = tbl

    print "computing column lengths"
    for table_name, table_data in tables.items():
        tables[table_name] = add_column_length(table_name, table_data)
    
    print '''\nFINAL STEP: \nSave files to output path'''
    for t_name, t in tables.items():
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        new_file_path = os.path.abspath(os.path.join(output_path, "{0}.tsv.bz2".format(t_name)))
        t.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True)

    total_run_time = (time.time() - start) / 60
    print; print;
    print "Total runtime: {0} minutes".format(int(total_run_time))
    print; print;