Ejemplo n.º 1
0
def main():
    """Generate and store engineered features for the anchor nodes of a graph.

    The node and edge tables are read from ``nodes_<graph>.csv`` and
    ``edges_<graph>.csv`` under ``<main_path>inputs/``.  Rows whose
    ``is_anchor`` column equals 1 are treated as anchor nodes; their
    features are produced by ``NodeEngFeatures`` and written to
    ``features_eth_0.csv`` in the same location.
    """
    cli = ArgParser.parse_args()

    # Plain string concatenation: main_path must already end with a separator.
    inputs_dir = cli.main_path + 'inputs/'
    graph_tag = cli.graph
    nodes_csv = inputs_dir + 'nodes_' + graph_tag + '.csv'
    edges_csv = inputs_dir + 'edges_' + graph_tag + '.csv'

    print('Read edge list and node list.')
    edges_df = pd.read_csv(edges_csv)
    nodes_df = pd.read_csv(nodes_csv)

    # Keep only the identifiers of the rows flagged as anchors.
    print('Retrieve anchor nodes list.')
    anchor_mask = nodes_df['is_anchor'] == 1
    anchor_nodes = nodes_df.loc[anchor_mask, 'node'].tolist()

    # Build the feature table for every anchor node of the graph.
    print('Generate features for the anchor nodes.')
    feature_builder = NodeEngFeatures(nodes_df, edges_df)
    anchor_features = feature_builder.gen_node_features_list(anchor_nodes)

    # Show a preview, then persist the table without the index column.
    print(anchor_features.head())
    print('Save node features dataframe.')
    anchor_features.to_csv(inputs_dir + 'features_eth_0.csv', index=False)
Ejemplo n.º 2
0
def runDielec(arg):
    """
    Runs UHBD calculation with a set of dielectric constants in dielc_file.

    arg is handed unchanged to the "dielec" ArgParser.modeClass parser, from
    which the required pdb_file and dielec_file entries and the optional
    ionic_strength, pHtitr and out_dir values are read.  pHtitr is indexed
    as (start, stop, interval).

    dielec_file is read as one dielectric constant per line; blank lines are
    ignored.  For every constant the directories
    <invocation_path>/<out_dir>/D<dielectric> and, below it,
    <ionic_strength> are requested through FileOps.makeDir, and indivRun is
    called with the innermost directory as its output location.
    """

    # Parse the command line arguments
    dielec = ArgParser.modeClass("dielec",["pdb_file","dielec_file"],
        ["inpfile","inpfile"],["ionic_strength","pHtitr","out_dir"])
    dielec.parseArg(arg)

    filename = dielec.required["pdb_file"]
    dielec_file = dielec.required["dielec_file"]
    output_path = dielec.optional.out_dir
    pH_start = dielec.optional.pHtitr[0]
    pH_stop = dielec.optional.pHtitr[1]
    pH_interval = dielec.optional.pHtitr[2]
    salt = dielec.optional.ionic_strength

    # The context manager closes the file even when a line fails to parse;
    # blank lines (e.g. a trailing newline) are skipped instead of making
    # float() raise.
    with open(dielec_file) as f:
        dielectric_constants = [float(line) for line in f if line.strip()]

    base_path = os.path.join(__init__.invocation_path,output_path)

    for dielectric in dielectric_constants:
        # Parenthesised so the statement is valid under Python 2 and 3.
        print("dielectric constant: %4.2F" % dielectric)

        # One directory per dielectric constant ...
        dielec_path = os.path.join(base_path,"D%.1F" % dielectric)
        FileOps.makeDir(dielec_path)

        # ... holding one sub-directory named after the ionic strength.
        dielec_output = os.path.join(dielec_path,"%.1F" % salt)
        FileOps.makeDir(dielec_output)

        indivRun(filename,dielec_output,pH_start,pH_stop,pH_interval,salt,
                 dielectric)
Ejemplo n.º 3
0
def main():
    """
    End-to-end node classification driver.

    The parsed ``clf_opt`` argument selects the pipeline:
    ``'fe'`` runs the feature-engineering analysis, ``'concat'`` classifies
    on engineered features combined with the ``flag`` embeddings, and any
    other value runs the RiWalk embedding analysis.  ``rnd_seed`` is not
    defined in this function and must be available at module level.
    """
    binary = True  # binary (True) versus multi-class (False) classification

    opts = ArgParser.parse_args()
    in_dir = opts.input
    graph = opts.graph
    out_dir = opts.output

    flag = opts.flag
    mode = opts.clf_opt
    exp_id = opts.exp_id

    # All locations are built by plain concatenation, so the directory
    # arguments must already end with a path separator.
    stats_csv = out_dir + 'stats_' + flag + '.csv'
    edges_csv = in_dir + 'edges_' + graph + '.csv'
    nodes_csv = in_dir + 'nodes_' + graph + '.csv'
    features_csv = out_dir + 'features_' + graph + '.csv'
    importance_csv = out_dir + 'feature_importance_' + graph + '.csv'

    if mode == 'fe':
        # Feature engineering: build the features/labels and classify on them.
        print("Node Classification --- Feature Engineering ---")
        EF_analysis_selected_nodes(
            out_dir, graph, edges_csv, nodes_csv, features_csv, stats_csv,
            importance_csv, 'FE', binary, rnd_seed, exp_id,
            extra_analysis=False)
        print("--- Node Classification Feature Engineering is done ---")
        return

    if mode == 'concat':
        # Engineered features concatenated with the embeddings.
        print("Node classification: Concat. FE &" + flag + " embeddings.")
        embedding_path = out_dir + 'emb_' + str(flag) + '_' + graph + '.emb'
        nd_clf_fe_emb_combined(embedding_path, features_csv, stats_csv, flag,
                               binary, exp_id)
        return

    # Every other option: RiWalk embeddings only.
    print("Node classification: --- RiWalk - " + flag + "---")
    embedding_path = out_dir + 'emb_' + str(flag) + '_' + graph + '.emb'
    RiWalk_analysis_selected_nodes(
        out_dir, graph, embedding_path, nodes_csv, stats_csv, flag, binary,
        exp_id, extra_analysis=False)
    print("--- Classification RiWalk is done ---")
Ejemplo n.º 4
0
def runSingle(arg):
    """
    Run one UHBD calculation for a single input file.

    arg is passed to the "single" ArgParser.modeClass parser; the required
    pdb_file and the optional out_dir, pHtitr (indexed 0..2),
    ionic_strength and dielectric values are forwarded to indivRun.
    """

    parser = ArgParser.modeClass("single",["pdb_file"],["inpfile"],
        ["dielectric", "ionic_strength","pHtitr","out_dir"])
    parser.parseArg(arg)

    # Pull the individual settings out before the call for readability.
    pdb_file = parser.required["pdb_file"]
    settings = parser.optional
    out_dir = settings.out_dir
    titration = settings.pHtitr
    ionic_strength = settings.ionic_strength
    dielectric = settings.dielectric

    indivRun(pdb_file, out_dir, titration[0], titration[1], titration[2],
             ionic_strength, dielectric)
Ejemplo n.º 5
0
def runSalts(arg):
    """
    Runs UHBD calculation on set of salts in salts_file.

    arg is handed unchanged to the "salts" ArgParser.modeClass parser, from
    which the required pdb_file and salt_file entries and the optional
    dielectric, pHtitr and out_dir values are read.  pHtitr is indexed as
    (start, stop, interval).

    salt_file is read as one ionic strength per line; blank lines are
    ignored.  For every value the directory
    <invocation_path>/<out_dir>/<salt> is created (an already existing
    directory is accepted) and indivRun is called with the relative path
    <out_dir>/<salt> as its output location.

    Any other directory-creation error is reported on standard output and
    terminates the process through sys.exit(1).
    """
    # Function-scope import so the block does not depend on the file header.
    import errno

    # Parse the command line arguments
    salts = ArgParser.modeClass("salts",["pdb_file","salt_file"],
        ["inpfile","inpfile"],["dielectric","pHtitr","out_dir"])
    salts.parseArg(arg)

    filename = salts.required["pdb_file"]
    salts_file = salts.required["salt_file"]
    output_path = salts.optional.out_dir
    pH_start = salts.optional.pHtitr[0]
    pH_stop = salts.optional.pHtitr[1]
    pH_interval = salts.optional.pHtitr[2]
    dielectric = salts.optional.dielectric

    # The context manager closes the file even when a line fails to parse;
    # blank lines are skipped instead of making float() raise.  The list
    # gets its own name so the parser object above is not rebound.
    with open(salts_file) as f:
        salt_values = [float(line) for line in f if line.strip()]

    # Execute the indivRun function for every salt in salts_file
    for salt in salt_values:
        # Parenthesised prints are valid under Python 2 and 3.
        print("Ionic strength: %4.2F" % salt)
        salt_output = os.path.join(output_path,"%.1F" % salt)
        salt_dir = os.path.join(__init__.invocation_path,salt_output)

        # Create output directory
        try:
            os.mkdir(salt_dir)
        except OSError as value:
            # Don't stop if we are only overwriting existing directory
            if value.errno != errno.EEXIST:
                print('File error.')
                print("%s %s %s" % (value.errno, salt_dir, value.strerror))
                # Non-zero status: this is a failure, not a normal exit.
                sys.exit(1)

        # Run UHBD at this specific salt concentration
        indivRun(filename,salt_output,pH_start,pH_stop,pH_interval,salt,
                 dielectric)
Ejemplo n.º 6
0
def runDielec(arg):
    """
    Runs UHBD calculation with a set of dielectric constants in dielc_file.

    arg is handed unchanged to the "dielec" ArgParser.modeClass parser, from
    which the required pdb_file and dielec_file entries and the optional
    ionic_strength, pHtitr and out_dir values are read.  pHtitr is indexed
    as (start, stop, interval).

    dielec_file is read as one dielectric constant per line; blank lines are
    ignored.  For every constant the directory
    <invocation_path>/<out_dir>/<dielectric> is created (an already
    existing directory is accepted) and indivRun is called with the relative
    path <out_dir>/<dielectric> as its output location.

    Any other directory-creation error is reported on standard output and
    terminates the process through sys.exit(1).
    """
    # Function-scope import so the block does not depend on the file header.
    import errno

    # Parse the command line arguments
    dielec = ArgParser.modeClass("dielec",["pdb_file","dielec_file"],
        ["inpfile","inpfile"],["ionic_strength","pHtitr","out_dir"])
    dielec.parseArg(arg)

    filename = dielec.required["pdb_file"]
    dielec_file = dielec.required["dielec_file"]
    output_path = dielec.optional.out_dir
    pH_start = dielec.optional.pHtitr[0]
    pH_stop = dielec.optional.pHtitr[1]
    pH_interval = dielec.optional.pHtitr[2]
    salt = dielec.optional.ionic_strength

    # The context manager closes the file even when a line fails to parse;
    # blank lines are skipped instead of making float() raise.
    with open(dielec_file) as f:
        dielectric_constants = [float(line) for line in f if line.strip()]

    for dielectric in dielectric_constants:
        # Parenthesised prints are valid under Python 2 and 3.
        print("dielectric constant: %4.2F" % dielectric)
        dielec_output = os.path.join(output_path,"%.1F" % dielectric)
        dielec_dir = os.path.join(__init__.invocation_path,dielec_output)

        # Create output directory
        try:
            os.mkdir(dielec_dir)
        except OSError as value:
            # Don't stop if we are only overwriting existing directory
            if value.errno != errno.EEXIST:
                print('File error.')
                print("%s %s %s" % (value.errno, dielec_dir, value.strerror))
                # Non-zero status: this is a failure, not a normal exit.
                sys.exit(1)

        indivRun(filename,dielec_output,pH_start,pH_stop,pH_interval,salt,
                 dielectric)
Ejemplo n.º 7
0
def runSalts(arg):
    """
    Runs UHBD calculation on set of salts in salts_file.

    arg is handed unchanged to the "salts" ArgParser.modeClass parser, from
    which the required pdb_file and salt_file entries and the optional
    dielectric, pHtitr and out_dir values are read.  pHtitr is indexed as
    (start, stop, interval).

    salt_file is read as one ionic strength per line; blank lines are
    ignored.  The directory <invocation_path>/<out_dir>/D<dielectric> is
    requested once through FileOps.makeDir; for every salt a sub-directory
    named after it is requested the same way and indivRun is called with
    that sub-directory as its output location.
    """

    # Parse the command line arguments
    salts = ArgParser.modeClass("salts",["pdb_file","salt_file"],
        ["inpfile","inpfile"],["dielectric","pHtitr","out_dir"])
    salts.parseArg(arg)

    filename = salts.required["pdb_file"]
    salts_file = salts.required["salt_file"]
    output_path = salts.optional.out_dir
    pH_start = salts.optional.pHtitr[0]
    pH_stop = salts.optional.pHtitr[1]
    pH_interval = salts.optional.pHtitr[2]
    dielectric = salts.optional.dielectric

    # The context manager closes the file even when a line fails to parse;
    # blank lines are skipped instead of making float() raise.  The list
    # gets its own name so the parser object above is not rebound.
    with open(salts_file) as f:
        salt_values = [float(line) for line in f if line.strip()]

    # All salt runs share one directory named after the dielectric constant.
    base_path = os.path.join(__init__.invocation_path,output_path)
    dielec_path = os.path.join(base_path,"D%.1F" % dielectric)
    FileOps.makeDir(dielec_path)

    # Execute the indivRun function for every salt in salts_file
    for salt in salt_values:
        # Parenthesised so the statement is valid under Python 2 and 3.
        print("Ionic strength: %4.2F" % salt)
        salt_output = os.path.join(dielec_path,"%.1F" % salt)

        # Create output directory
        FileOps.makeDir(salt_output)

        # Run UHBD at this specific salt concentration
        indivRun(filename,salt_output,pH_start,pH_stop,pH_interval,salt,
                 dielectric)
Ejemplo n.º 8
0
def main():
    """
    Graph preparation experiments.

    Loads the node/edge tables of a graph together with the elliptic
    class and feature tables, then runs the task selected by the parsed
    ``gprep_opt`` argument: ``'feature'`` generates features for all nodes,
    ``'ri'`` writes the edge list and node list consumed by RiWalk.

    Raises:
        ValueError: if ``gprep_opt`` is neither ``'feature'`` nor ``'ri'``.
    """
    cli = ArgParser.parse_args()
    in_dir = cli.input
    graph = cli.graph
    out_dir = cli.output
    prep_task = cli.gprep_opt

    # Input tables (directly from XBlock).
    nodes_csv = in_dir + 'nodes_' + graph + '.csv'
    edges_csv = in_dir + 'edges_' + graph + '.csv'

    # Files produced or consumed in the output location.
    riwalk_nodelist = out_dir + 'nodes_' + graph + '.nodelist'
    riwalk_edgelist = out_dir + 'edges_' + graph + '.edgelist'
    features_csv = out_dir + 'features_' + graph + '.csv'
    importance_csv = out_dir + 'feature_importance_' + graph + '.csv'

    # Elliptic dataset tables.
    classes_csv = in_dir + '_elliptic_txs_classes.csv'
    elliptic_csv = in_dir + '_elliptic_txs_features.csv'

    nodes = pd.read_csv(nodes_csv)
    edges = pd.read_csv(edges_csv)

    elliptic_classes = pd.read_csv(classes_csv)
    elliptic_features = pd.read_csv(elliptic_csv)

    # Name the feature columns: two identifier columns followed by the
    # tx_feat_2..94 and agg_feat_1..72 blocks.
    column_names = ["address", "timestamp"]
    column_names.extend("tx_feat_" + str(idx) for idx in range(2, 95))
    column_names.extend("agg_feat_" + str(idx) for idx in range(1, 73))
    elliptic_features.columns = column_names

    # Attach the class table, keeping every feature row (left join).
    elliptic_features = pd.merge(
        elliptic_features, elliptic_classes,
        left_on="address", right_on="txId", how='left')

    def _unknown_to_zero(value):
        # "unknown" labels are rewritten to the string '0'; others pass through.
        if value == "unknown":
            return '0'
        return value

    elliptic_features['label'] = elliptic_features['label'].apply(
        _unknown_to_zero)

    # Feature-engineering helper shared by both tasks.
    bc_fe = bfe.BitcoinNodeEngFeatures(nodes, edges)

    if prep_task == 'feature':
        generate_features_for_all_nodes(bc_fe, elliptic_features, out_dir,
                                        graph)
        return

    if prep_task != 'ri':
        raise ValueError("Incorrect value for graph preparation option!")

    # RiWalk inputs: first the edge list ...
    generate_edgelist_for_RiWalk(bc_fe, riwalk_edgelist)

    # ... then the node list, carrying every ranked feature column.
    all_features_df = pd.read_csv(features_csv)
    ranked_features = pd.read_csv(importance_csv)['feature'].tolist()
    selected_features = ['node'] + ranked_features
    generate_node_list_for_RiWalk(all_features_df, selected_features,
                                  riwalk_nodelist)
Ejemplo n.º 9
0
def main():
    """
    Run one node-classification experiment.

    The parsed ``clf_opt`` argument selects the pipeline: ``'fe'`` runs the
    feature-engineering analysis, ``'concat'`` classifies on engineered
    features combined with the ``flag`` embeddings, and any other value
    runs the RiWalk embedding analysis.  ``rnd_seed`` is not defined in
    this function and must be available at module level.
    """
    # ATTENTION: always use BINARY classification (True), not multi-class.
    binary = True

    cli = ArgParser.parse_args()
    in_dir = cli.input
    graph = cli.graph
    out_dir = cli.output

    flag = cli.flag
    mode = cli.clf_opt
    exp_id = cli.exp_id

    # Locations are built by plain concatenation, so the directory
    # arguments must already end with a path separator.
    stats_csv = out_dir + 'stats_' + flag + '.csv'
    edges_filename = in_dir + 'edges_' + graph + '.csv'
    nodes_csv = in_dir + 'nodes_' + graph + '.csv'
    features_csv = out_dir + 'features_' + graph + '.csv'
    importance_csv = out_dir + 'feature_importance_' + graph + '.csv'

    if mode == 'fe':
        # Feature engineering: build the features/labels and classify on them.
        print("Node Classification --- Feature Engineering ---")
        Bitcoin_EF_analysis_selected_nodes(
            out_dir, graph, features_csv, stats_csv, importance_csv, 'FE',
            binary, rnd_seed, exp_id, extra_analysis=False)
        print("--- Node Classification Feature Engineering is done ---")
        return

    if mode == 'concat':
        # Engineered features concatenated with the embeddings.
        print("Node classification: Concat. FE &" + flag + " embeddings.")
        embedding_path = out_dir + 'emb_' + str(flag) + '_' + graph + '.emb'
        bitcoin_nd_clf_fe_emb_combined(embedding_path, features_csv,
                                       stats_csv, flag, binary, exp_id)
        return

    # Every other option: RiWalk embeddings only.
    print("Node classification: --- RiWalk - " + flag + "---")
    embedding_path = out_dir + 'emb_' + str(flag) + '_' + graph + '.emb'
    nd.RiWalk_analysis_selected_nodes(
        out_dir, graph, embedding_path, nodes_csv, stats_csv, flag, binary,
        exp_id, extra_analysis=False)
    print("--- Classification RiWalk is done ---")
Ejemplo n.º 10
0
def main(args=None):
    """
    Entry point: read the options, load a synthesis with its companion
    flight and terrain objects, then run the requested printouts and plots.

    args -- None (options are obtained from parser.start()), a string
            (handed to parser.start(args)), or any other object, which is
            used as the options object as is.

    Returns, in order of precedence: the result of
    Plotter.compare_synth_flight when args.valid is set; the result of the
    profile helper when args.prof is set; the last Plotter.plot_synth
    result when args.multi is set; None otherwise.  The print_* options
    end the process through sys.exit() instead of returning.

    NOTE(review): written for Python 2 (print statements, raw_input,
    basestring).
    """

    config = configp.start()
    if args is None:
        args = parser.start()
    elif isinstance(args,basestring):
        args = parser.start(args)
    
    # Copy the options used below into local names.
    cedfile = args.ced
    stdfile = args.std
    plotFields = args.field 
    multi = args.multi
    print_shapes = args.print_shapes
    print_global_atts = args.print_global_atts
    print_axis = args.print_axis
    print_list_synth = args.print_list_synth
    terrain = args.terrain
    slope = args.slope
    meteo = args.meteo
    valid = args.valid
    prof = args.prof
    nearest = args.nearest
    noplot = args.no_plot

    """ print synhesis availables """
    # Interactive listing: show the entries of the synthesis folder, ask the
    # user for one of them, list its entries ending in 'cdf', then exit.
    if print_list_synth:
        synth_folder=fh.set_working_files(config=config)
        out=os.listdir(synth_folder)
        print "Synthesis directories available:"
        for f in out:
            print f
        usr_input = raw_input('\nIndicate directory: ')
        out=os.listdir(synth_folder+'/'+usr_input)
        print "\nSyntheses available:"
        out.sort()
        for f in out:
            if f[-3:]=='cdf': print f
        print '\n'
        sys.exit()


    """ retrieves synthesis and flight instances
        from AircraftAnalysis
    """
    SYNTH, FLIGHT, TERRAIN = fh.set_working_files(cedfile=cedfile,
                                                stdfile=stdfile,
                                                config=config)
    
    """ print shape of attribute arrays """
    # Exits here unless the global attributes were requested as well; in
    # that case the next block prints them and exits.
    if print_shapes:
        SYNTH.print_shapes()
        if not print_global_atts: 
            sys.exit()

    """ print global attirutes of cedric synthesis """
    if print_global_atts:
        SYNTH.print_global_atts()
        sys.exit()

    """ print axis values """
    # Fully upper-case axis names are lowered before being passed on.
    if print_axis:
        for ax in print_axis:
            if ax.isupper():
                ax=ax.lower()
            SYNTH.print_axis(ax)
        sys.exit()

    """ print synthesis time """
#    print "Synthesis start time :%s" % SYNTH.start
#    print "Synthesis end time :%s\n" % SYNTH.end

    """ make synthesis plots """
    # P is rebound on every iteration, so after the loop it refers to the
    # plot of the last requested field only.  It stays unbound when
    # plotFields is empty.
    if plotFields:
        for f in plotFields:
            P=Plotter.plot_synth(SYNTH,FLIGHT,TERRAIN,
                                var=f,
                                wind=args.wind,
                                panel=args.panel,
                                slicem = args.slicem,
                                slicez = args.slicez,
                                slice = args.slice,
#                                azimuth = args.azimuth,
#                                distance = args.distance,
                                zoomIn=args.zoomin,
                                mask = args.mask,
                                config=config)

    """ make terrain plots """
    # NOTE(review): P is only bound by the loop above; requesting terrain or
    # slope without any field looks like it raises UnboundLocalError here
    # -- confirm and guard if so.
    if terrain or slope:
        # P[0] might produce error if P is not a list, 
        # check in ploth_synth.cross_section
        Plotter.plot_terrain(P[0],
                             terrain=terrain,
                             slope=slope,
                             terrain_file=config['filepath_dtm'])

    """ make flight level meteo plot """
    if meteo:
        Plotter.plot_flight_meteo(SYNTH,FLIGHT)

    """ compare synth and flight level """
    # Returns immediately: the profile and multi/show sections below are
    # skipped whenever valid is set.
    if valid:
        out = Plotter.compare_synth_flight(SYNTH,FLIGHT,
                                     level=valid,
                                     zoomin=args.zoomin,
                                     noplot=noplot)
        
#        if 'wind_profiler' in config:
#            case=int(cedfile[1:3])
#            Plotter.compare_with_windprof(SYNTH,
#                                          location=config['wind_profiler'],
#                                          case=case)
        return out
        
    """ make profile from synthesis """
    # Without 'nearest' the profile is built from the given coordinates;
    # with it, nearest[0] supplies (max_dist, n_neigh) for the
    # nearest-neighbour variant.
    if prof:
        if nearest is None:
            markers=['o','s','D','*']
            out = Plotter.make_synth_profile(SYNTH,coords=prof,
                                             markers=markers,
                                             noplot=noplot)
        else:
            ' (4.5km,n=12) or (7.0km,n=30) seem good choices '
            out = Plotter.make_synth_profile_withnearest(SYNTH,
                                                         target_latlon=prof,
                                                         max_dist=nearest[0][0], # [km]
                                                         n_neigh =nearest[0][1])        
        
        # Mark each profile location on the synthesis plot when one exists;
        # the unbound-P case is detected through UnboundLocalError.
        # NOTE(review): P is used as an object here (P.haxis) but indexed as
        # P[0] in the terrain section -- confirm which shape plot_synth
        # returns.
        try:        
            ' if P exists '
            for i,p in enumerate(prof):
                lat,lon = p
                P.haxis.scatter(lon,lat,
                                s=config['sounding_size'],
                                c=config['sounding_color'],
                                marker=config['sounding_marker'],
                                lw=3)
        except UnboundLocalError:
            ' if P does not exist just pass '
            pass
        return out
        
        
    # if turbulence:
    # Plotter.print_covariance(SYNTH,FLIGHT)
    # Plotter.print_correlation(SYNTH,FLIGHT)
    # Plotter.plot_wind_comp_var(SYNTH,FLIGHT)
    # Plotter.plot_tke(SYNTH,FLIGHT)
    # Plotter.plot_vertical_heat_flux(SYNTH,FLIGHT)
    # Plotter.plot_vertical_momentum_flux(SYNTH,FLIGHT,config['filepath_dtm'])
    # Plotter.plot_turbulence_spectra(SYNTH,FLIGHT)

    # multi: close every figure and hand the last plot object back to the
    # caller; otherwise show the figures without blocking.
    # NOTE(review): 'return P' has the same unbound-P hazard as the terrain
    # section when no field was requested -- confirm.
    if multi:
        plt.close('all')
        return P
    else:        
        ''' use this one with ipython '''
        plt.show(block=False)    
        ''' use this one with the shell '''
Ejemplo n.º 11
0
        
        
    # if turbulence:
    # Plotter.print_covariance(SYNTH,FLIGHT)
    # Plotter.print_correlation(SYNTH,FLIGHT)
    # Plotter.plot_wind_comp_var(SYNTH,FLIGHT)
    # Plotter.plot_tke(SYNTH,FLIGHT)
    # Plotter.plot_vertical_heat_flux(SYNTH,FLIGHT)
    # Plotter.plot_vertical_momentum_flux(SYNTH,FLIGHT,config['filepath_dtm'])
    # Plotter.plot_turbulence_spectra(SYNTH,FLIGHT)

    if multi:
        plt.close('all')
        return P
    else:        
        ''' use this one with ipython '''
        plt.show(block=False)    
        ''' use this one with the shell '''
        # plt.show()




"""call main function """
# Script entry point: obtain the options from parser.start() and run main().
# Both the options and main()'s return value are kept in module-level names.
if __name__ == "__main__":

    args = parser.start()
    p = main(args)


Ejemplo n.º 12
0
import ArgParser
import sys

# Build the parser object from the raw command-line argument list.
Parser = ArgParser.ArgParser(sys.argv)

# Print the parser's ParsedArgs attribute.
print(Parser.ParsedArgs)
Ejemplo n.º 13
0
def main():
    """
    Train node2vec embeddings on a graph and classify its anchor nodes.

    The edge list ``<main_path>/inputs/<graph>.edgelist`` is loaded as a
    directed graph, a Node2Vec model is fitted on it, and the embeddings of
    the rows of ``nodes_<graph>.csv`` with ``is_anchor == 1`` are classified
    against their ``isp`` column.  Classification statistics go to
    ``<main_path>/outputs/stats.csv``; the elapsed time is printed at the
    end.
    """
    print("Node2Vec main method.")
    t_begin = time.time()

    cli = ArgParser.parse_args()

    root = cli.main_path
    graph = cli.graph

    # Random-walk and training settings.
    n_iterations = cli.iter
    walks_per_node = cli.num_walks
    emb_dim = cli.dimensions
    steps_per_walk = cli.walk_length
    n_workers = cli.workers
    context_window = cli.window_size
    p_param = cli.p
    q_param = cli.q

    inputs_dir = root + '/inputs/'
    outputs_dir = root + '/outputs/'
    edgelist_path = inputs_dir + graph + '.edgelist'
    nodes_csv = inputs_dir + 'nodes_' + graph + '.csv'
    stats_csv = outputs_dir + 'stats.csv'

    # The edge table must provide 'source' and 'target' columns.
    edge_table = pd.read_csv(edgelist_path)
    digraph = nx.from_pandas_edgelist(edge_table,
                                      source='source',
                                      target='target',
                                      create_using=nx.DiGraph())
    print("Graph info:", nx.info(digraph))

    print("\tInstantiate a node2vec object.")
    walker = Node2Vec(digraph,
                      dimensions=emb_dim,
                      walk_length=steps_per_walk,
                      num_walks=walks_per_node,
                      workers=n_workers,
                      p=p_param,
                      q=q_param)

    print("\tFit node2vec.")
    fit_options = dict(window=context_window,
                       sg=1,
                       hs=0,
                       min_count=1,
                       iter=n_iterations)
    model = walker.fit(**fit_options)

    print("\tExtract embeddings and labels for the anchor nodes.")
    node_table = pd.read_csv(nodes_csv)

    # Anchor rows only, restricted to the identifier and label columns.
    anchors = node_table.loc[node_table['is_anchor'] == 1, ['node', 'isp']]
    # Embeddings are looked up by the string form of the node identifier.
    anchor_keys = list(map(str, anchors['node'].tolist()))
    embeddings = [model.wv.get_vector(key) for key in anchor_keys]
    labels = anchors['isp'].tolist()

    print("\tApply classification.")
    rnd_seed = 42
    binary = True
    split = NodeClassification.train_test_split(embeddings, labels, rnd_seed)
    X_train, X_test, y_train, y_test = split
    NodeClassification.rf_lr_classification(X_train,
                                            X_test,
                                            y_train,
                                            y_test,
                                            stats_csv,
                                            'n2v',
                                            binary=binary,
                                            print_report=True)
    print("Total elapsed time:", str(time.time() - t_begin))