parser.add_argument("-p",
                        "--NPROC",
                        type=int,
                        default=pytheas.available_cpu_count())
    parser.add_argument("-m", "--max_lines", type=int, default=10000)
    parser.add_argument("-c",
                        "--db_cred_file",
                        default="database_credentials.json")
    args = parser.parse_args(sys.argv[1:])

    with open(args.db_cred_file) as f:
        credentials = json.load(f)

    args = parser.parse_args(sys.argv[1:])

    # Database connection credentials
    db_cred = DotMap()
    db_cred.user = credentials["user"]
    db_cred.password = credentials["password"]
    db_cred.database = credentials["ground_truth_db"]
    db_cred.opendata_database = credentials["profile_db"]
    db_cred.port = credentials["port"]

    Pytheas = pytheas.PYTHEAS()
    Pytheas.load_weights(args.weights)

    pytheas.process_endpoint(args.portals,
                             db_cred,
                             NPROC=args.NPROC,
                             max_lines=args.max_lines)
    # Create the two working directories. exist_ok=True replaces the separate
    # os.path.exists() test, which could race with another process creating
    # the directory between the check and the makedirs call.
    for required_dir in ('add_nulls', 'add_noise'):
        os.makedirs(required_dir, exist_ok=True)

    # Echo the run configuration before the evaluation starts.
    print(f'inject_percent_null={inject_percent_null}')
    print(f'inject_percent_outlier={inject_percent_outlier}')
    print(f'num_processors={num_processors}')

    k_folds = args.k_folds

    # Database connection credentials; the password is requested
    # interactively instead of being taken from the command line.
    db_cred = DotMap()
    db_cred.user = args.user
    db_cred.database = args.database
    db_cred.port = args.port
    db_cred.password = getpass(
        prompt=f'Please enter password for user {db_cred.user} on database {db_cred.database}:')

    # Hard-coded here rather than read from args.
    max_lines = 100

    pat_classifier = pat.PYTHEAS()
    average_results = eval_pat_line_kfold(pat_classifier,
                                          k_folds,
                                          db_cred,
                                          top_level_dir,
                                          inject_percent_null,
                                          inject_percent_outlier,
                                          num_processors,
                                          max_lines)
# Example #3
# 0
    # type=int keeps a port given on the command line consistent with the
    # integer default (without it, a CLI value would arrive as a string).
    parser.add_argument("-p", "--port", default=5532, type=int, help="port that postgresql database listens to")
    parser.add_argument("-n", "--num_processors", default = 64, type=int, help="number of processors to be used")
    parser.add_argument("-t", "--top_level_dir", default="/home/christina/OPEN_DATA_CRAWL_2018", help="path to Open Data Crawl")

    args = parser.parse_args(sys.argv[1:])
    # Never ask for more workers than pytheas.available_cpu_count() reports.
    num_processors = min(args.num_processors, pytheas.available_cpu_count())
    top_level_dir = args.top_level_dir

    # Database connection credentials (the password is left empty here).
    db_cred = DotMap()
    db_cred.user = args.user
    db_cred.database = args.database
    db_cred.port = args.port
    db_cred.password = ''

    pytheas_model = pytheas.PYTHEAS()

    pytheas_model.collect_rule_activation(db_cred, num_processors, top_level_dir)

    print('\nLoading CACHED Training Data...')

    con = connect(dbname=db_cred.database,
                  user=db_cred.user,
                  host='localhost',
                  password=db_cred.password,
                  port=db_cred.port)
    try:
        # Plain string: the query has no placeholders, so no f-prefix needed.
        undersampled_cell_data = pd.read_sql_query(
            sql="SELECT * FROM pat_data_cell_rules WHERE undersample=True",
            con=con)
    finally:
        # Release the connection even when the query raises.
        con.close()