Example #1
def deal_feature(file_in, file_out, file_data, ponzi=None):
    color.pInfo('Dealing with features')

    ins = pd.read_csv(file_in, encoding='utf-8')
    outs = pd.read_csv(file_out, encoding='utf-8')

    # extract per-address value and time sequences from the raw transaction tables
    addr_in, val_ins, time_ins = sequence(ins)
    addr_out, val_outs, time_outs = sequence(outs)

    ins = [[addr_in[i], val_ins[i], time_ins[i]] for i in range(len(addr_in))]
    outs = [[addr_out[i], val_outs[i], time_outs[i]] for i in range(len(addr_out))]

    df_in = pd.DataFrame(ins, columns=['address', 'val_in', 'time_in'])
    df_out = pd.DataFrame(outs, columns=['address', 'val_out', 'time_out'])

    # place the incoming and outgoing sequences side by side; the outer
    # join keeps rows even when one side is shorter
    df = pd.concat([df_in, df_out], join='outer', axis=1)
    df.to_csv(file_data, index=False)
    color.pDone('Have generated ' + file_data + '.')
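A minimal usage sketch, assuming the two transaction CSVs were already produced by the collection step; the file names here are hypothetical:

# Hypothetical invocation; deal_feature only needs the two input CSVs and
# a destination path for the merged table.
deal_feature(
    os.path.join('result', 'ponzi_in.csv'),
    os.path.join('result', 'ponzi_out.csv'),
    os.path.join('result', 'ponzi_data.csv'),
)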
Example #2
def deal_out(addr_file, in_file, to_file):
    color.pInfo('Dealing with ' + in_file)
    # append a sentinel so the read loop below has a clean stop condition
    os.system('echo "EOF" >> ' + in_file)

    address = readAddr(addr_file)
    transactions = []

    with open(in_file, 'r', encoding='utf-8') as f:
        line = f.readline().strip()
        index = 0
        while line != 'EOF':
            data = []
            if line == '':
                line = f.readline().strip()
                continue

            # a '(N rows)' footer closes one address's result set,
            # so move on to the next address
            if line[0] == '(':
                index = index + 1

            # data rows start with a timestamp (year 2xxx)
            if line[0] == '2':
                attributes = line.split('|')
                for i in range(len(attributes)):
                    attributes[i] = attributes[i].strip()
                data = [address[index], attributes[0], attributes[1]]
                transactions.append(data)
            line = f.readline().strip()

    color.pInfo('Collected ' + str(len(transactions)) + ' transactions.')
    df = pd.DataFrame(data=transactions, columns=names_transaction)
    df.to_csv(to_file,index=False)
    color.pDone('Done')
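The single-character checks in deal_out imply psql's aligned \o output; that format is an inference from the parser, not documented in the source. A self-contained sketch with a hypothetical sample:

# Hypothetical sample of the psql output deal_out expects: '|'-separated
# data rows starting with a year, and a '(N rows)' footer per result set.
sample = """      timestamp      | value
---------------------+-------
 2017-01-01 00:00:00 | 1000
 2017-01-02 12:30:00 | 2500
(2 rows)
EOF"""
for line in sample.splitlines():
    line = line.strip()
    if line and line[0] == '2':  # data row: begins with the year
        timestamp, value = [a.strip() for a in line.split('|')]
        print(timestamp, value)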
Example #3
def collectAddr(p, n=N, timeout=120):
    log_file = os.path.join('log', 'collect.log')
    examLog(log_file)
    # the log stores the OFFSET reached by the previous round
    last = fetchLog(log_file)
    new = int(last) + n

    out_file = os.path.join('result', 'addr_' + last + '.out')
    p.sendline('\\o ' + out_file)
    p.expect('#')

    query = 'SELECT address FROM code WHERE address IN \
    (SELECT to_address from external_transaction WHERE value!=\'0\' ORDER BY number DESC limit \
    ' + str(n) + ' OFFSET ' + str(last) + ') ORDER BY number DESC;'
    p.sendline(query)
    color.pInfo('Executing query \'' + query + '\', raising TimeOut \
        exception in ' + str(timeout) + ' sec.')
    p.expect('#', timeout=timeout)
    color.pDone('Done query.')

    with open(out_file) as f:
        out = f.readlines()
    try:
        # the useful line sits second from the end of the \o output
        out = out[-2]
    except IndexError:
        color.pError('Failed to write the results')
        p.close()
        sys.exit(1)

    color.pDone('Collected address ' + out + '\nWritten in ' + out_file + ' .')
    writeLog(log_file, new)
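examLog, fetchLog, and writeLog are external helpers; a minimal sketch of the contract collectAddr appears to assume, with hypothetical stand-ins:

# Hypothetical stand-ins: the log file holds the last OFFSET as plain text,
# so each round resumes where the previous one stopped.
def fetchLog(log_file):
    with open(log_file) as f:
        return f.read().strip() or '0'

def writeLog(log_file, new):
    with open(log_file, 'w') as f:
        f.write(str(new))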
Example #4
def val_sql(addr_file, query, p):
    import pexpect
    ponzi_addr = readAddr(addr_file)
    sql = os.path.join('sql', 'val.sql')
    # write one statement per address: query is a (prefix, suffix) pair
    # that brackets each address
    with open(sql, 'w') as f:
        for data in ponzi_addr:
            sentence = query[0] + data + query[1]
            f.write(sentence)

    p.sendline('\\i ' + sql)
    time = 1
    # poll every 10 minutes (timeout=600 s) until the psql prompt returns
    index = p.expect(['#', pexpect.TIMEOUT], timeout=600)
    while index == 1:
        color.pInfo('searched for ' + str(time) + '0 mins')
        time = time + 1
        index = p.expect(['#', pexpect.TIMEOUT], timeout=600)
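A usage sketch mirroring how collectTxnIn calls val_sql: the query argument is a (prefix, suffix) pair wrapped around each address; the address file name is hypothetical:

# (prefix, suffix) pair bracketing each address, as built in collectTxnIn
query = [
    'select block_hash,value from external_transaction where to_address=\'',
    '\' and value!=\'0\';',
]
# p = connectPSQL(psql)                      # live pexpect psql session
# val_sql('address/ponzi_addr.csv', query, p)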
Example #5
def deal_in(addr_file, in_file, to_file):
    color.pInfo('Dealing with ' + in_file)
    # append a sentinel so the read loop below has a clean stop condition
    os.system('echo "EOF" >> ' + in_file)

    address = readAddr(addr_file)
    transactions = []
    block_hash = []

    with open(in_file, 'r', encoding='utf-8') as f:
        index = 0
        line = f.readline().strip()
        while line != 'EOF':
            if line == '':
                line = f.readline().strip()
                continue

            # a '(N rows)' footer closes one address's result set
            if line[0] == '(':
                index = index + 1

            data = []
            # data rows start with '\' because block hashes are printed
            # in bytea form (\x...)
            if line[0] == '\\':
                attributes = line.split('|')
                for i in range(len(attributes)):
                    attributes[i] = attributes[i].strip()
                try:
                    # leave the timestamp column empty for now; it is
                    # filled in later by deal_in_timestamp
                    data = [address[index], '', attributes[1]]
                    block_hash.append(attributes[0])
                    transactions.append(data)
                except IndexError:
                    color.pError('out of index')
                    print('index', index)
                    print('attribute', attributes)
                    break

            line = f.readline().strip()

    df = pd.DataFrame(data=transactions, columns=names_transaction)
    df.to_csv(to_file,index=False)
    color.pDone('Done')
    '''
    Because the external_transaction table has no timestamp column,
    record the block hashes here, then pull each block's timestamp and
    use it as the timestamp of its transactions.
    '''
    
    return block_hash
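The returned hashes feed a second query pass; this comment sketch mirrors how collectTxnIn (Example #8) consumes deal_in's return value:

# block_hash = deal_sql.deal_in(addr, out_file, txn_file)   # parse values, keep hashes
# sq.timestamp_sql(block_hash, p)                           # query each block's timestamp
# deal_sql.deal_in_timestamp(txn_file, time_file)           # fill the empty time column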
Example #6
def extract(database):
    # database_ponzi = path.join('feature','nponzi_feature_raw.csv')
    color.pInfo("Dealing with transaction data data")

    raw_data = pd.read_csv(database)
    raw_data = raw_data.fillna(0)
    tx_features = []
    f_names = [  #'ponzi',
        'address',
        'nbr_tx_in',
        'nbr_tx_out',
        'Tot_in',
        'Tot_out',
        'mean_in',
        'mean_out',
        'sdev_in',
        'sdev_out',
        'gini_in',
        'gini_out',
        'avg_time_btw_tx',
        # 'gini_time_out',
        'lifetime',
    ]
    for i in range(raw_data.shape[0]):
        # ponzi = raw_data.iloc[i]['ponzi']
        address = raw_data.iloc[i]['address']
        time_in = raw_data.iloc[i]['time_in']
        time_out = raw_data.iloc[i]['time_out']
        val_in = raw_data.iloc[i]['val_in']
        val_out = raw_data.iloc[i]['val_out']
        if val_in != '' or val_out != '':
            #f = tl.basic_features(ponzi, time_in, time_out, val_in, val_out)
            f = tl.basic_features(None, address, time_in, time_out, val_in,
                                  val_out)
            tx_features.append(f)

    tl.compute_time(t0)  # t0 is assumed to be a module-level start time

    df_features = pd.DataFrame(tx_features, columns=f_names)
    name = os.path.basename(database).split('.')[0]
    f_file = os.path.join(
        'feature',
        name.split('_')[0] + '_' + name.split('_')[1] + '_feature.csv')
    df_features.to_csv(f_file, index=None)
    color.pDone('Have written feature file ' + f_file + '.')
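tl.basic_features is external, so the exact definitions of the gini_in/gini_out features are not shown here; a minimal sketch of one standard Gini formulation they plausibly apply to the value sequences:

import numpy as np

def gini(values):
    # Mean-absolute-difference formulation over sorted values:
    # G = sum_i (2*i - n - 1) * x_i / (n * sum(x)), i = 1..n
    x = np.sort(np.asarray(values, dtype=float))
    n = len(x)
    if n == 0 or x.sum() == 0:
        return 0.0
    idx = np.arange(1, n + 1)
    return float(np.sum((2 * idx - n - 1) * x) / (n * x.sum()))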
Example #7
def timestamp_sql(in_hash, p):
    import pexpect
    hashes = in_hash
    sql = os.path.join('sql', 'time.sql')
    # write one timestamp lookup per block hash
    with open(sql, 'w') as f:
        for data in hashes:
            sentence = 'select timestamp from block where hash=\'' + data + '\';\r'
            f.write(sentence)

    p.sendline('\\i ' + sql)
    time = 1
    # poll every 10 minutes (timeout=600 s) until the psql prompt returns
    index = p.expect(['#', pexpect.TIMEOUT], timeout=600)
    while index == 1:
        color.pInfo('searched for ' + str(time) + '0 mins')
        time = time + 1
        index = p.expect(['#', pexpect.TIMEOUT], timeout=600)
Example #8
def collectTxnIn(p, addr, timeout=200):
    import sql_query as sq

    color.pInfo('Collecting transactions into contract')
    # two (prefix, suffix) query pairs: [:2] targets external transactions,
    # [-2:] targets internal transactions
    query_in = [
        'select block_hash,value from external_transaction where to_address=\'',
        '\' and value!=\'0\' limit 1000;',
        'select timestamp, value from internal_transaction where to_address=\'',
        '\' and value!=\'0\' limit 1000;'
    ]
    name = os.path.basename(addr).split('.')[0]

    # send value, hash command to sql process
    out_file = os.path.join('result', name + '_in.out')
    color.pInfo('Sending incoming transaction query to psql server')
    p.sendline('\\o ' + out_file)
    p.expect('#')
    sq.val_sql(addr, query_in[:2], p)
    color.pDone('Have generated ' + out_file + '.')

    # send time command to sql process
    txn_file = os.path.join('result', name + '_in.csv')
    time_file = os.path.join('result', name + '_time.out')
    block_hash = deal_sql.deal_in(addr, out_file, txn_file)

    color.pInfo('Sending incoming timestamp query to psql server')
    p.sendline('\\o ' + time_file)
    p.expect('#')
    sq.timestamp_sql(block_hash, p)
    color.pDone('Have generated ' + time_file + '.')

    # collect the query result into txn features
    deal_sql.deal_in_timestamp(txn_file, time_file)

    # send internal command to sql process
    out_inter_file = os.path.join('result', name + '_internal.out')
    color.pInfo('Sending incoming transaction in internal_trx to psql server')
    p.sendline('\\o ' + out_inter_file)
    p.expect('#')
    sq.val_sql(addr, query_in[-2:], p)
    color.pDone('Have generated ' + out_inter_file + '.')

    # collect the query result into txn features
    txn_file_inter = os.path.join('result', name + 'inter_in.csv')
    deal_sql.deal_out(addr, out_inter_file, txn_file_inter)

    df1 = pd.read_csv(txn_file)
    df2 = pd.read_csv(txn_file_inter)
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
    df = pd.concat([df1, df2])
    # sort_values returns a new frame, so the result must be reassigned
    df = df.sort_values(by='address')
    df.to_csv(txn_file, index=None)
    color.pImportant('incoming txn shape' + str(df.shape))

    return txn_file
Example #9
def collectTxnIn(p, addr, timeout=200):
    import sql_query as sq

    color.pInfo('Collecting transactions into contract')
    query_in = [
        'select block_hash,value from external_transaction where to_address=\'',
        '\' and value!=\'0\';'
    ]
    name = os.path.basename(addr).split('.')[0]

    # send command to sql process
    out_file = os.path.join('result', name + '_in.out')
    color.pInfo('Sending incoming transaction query to psql server')
    p.sendline('\\o ' + out_file)
    p.expect('#')
    sq.val_sql(addr, query_in, p)
    color.pDone('Have generated ' + out_file + '.')

    # send timestamp command to sql process
    txn_file = os.path.join('result', name + '_in.csv')
    time_file = os.path.join('result', name + '_time.out')
    block_hash = deal_sql.deal_in(addr, out_file, txn_file)

    color.pInfo('Sending incoming timestamp query to psql server')
    p.sendline('\\o ' + time_file)
    p.expect('#')
    sq.timestamp_sql(block_hash, p)
    color.pDone('Have generated ' + time_file + '.')

    # collect the query result into txn features
    deal_sql.deal_in_timestamp(txn_file, time_file)

    return txn_file
Example #10
def collectTxnOut(p, addr, timeout=200):
    import sql_query as sq

    color.pInfo('Collecting transactions out of contract')
    query_out = [
        'select timestamp, value from internal_transaction where from_address=\'',
        '\' and value!=\'0\' limit 1000;\r',
    ]
    name = os.path.basename(addr).split('.')[0]

    # send command to sql process
    out_file = os.path.join('result', name + '_out.out')
    color.pInfo('Sending outgoing transaction query to psql server')
    p.sendline('\\o ' + out_file)
    p.expect('#')
    sq.val_sql(addr, query_out, p)
    color.pDone('Have generated ' + out_file + '.')

    # collect the query result into txn features
    txn_file = os.path.join('result', name + '_out.csv')
    deal_sql.deal_out(addr, out_file, txn_file)

    return txn_file
Example #11
def deal_in_timestamp(txn_file, time_file):
    color.pInfo('Dealing with ' + time_file)
    # append a sentinel so the read loop below has a clean stop condition
    os.system('echo "EOF" >> ' + time_file)

    transactions = pd.read_csv(txn_file, low_memory=False)
    timestamps = []
    num = 0

    with open(time_file, 'r', encoding='utf-8') as f:
        line = f.readline().strip()
        while line != 'EOF':
            if line == '':
                line = f.readline().strip()
                continue

            # timestamp rows start with the year (2xxx)
            if line[0] == '2':
                timestamps.append(line)
                num = num + 1
                if num % 1000000 == 0:
                    color.pDone('processed ' + str(num) + ' timestamps')
            line = f.readline().strip()
    color.pInfo('adding timestamps to transaction')
    # the timestamps are assumed to come back in the same order as the
    # transaction rows, so they are aligned positionally
    j = 0
    last = transactions['address'][0]
    for i in range(transactions.shape[0]):
        if transactions['address'][i]:
            # not empty
            transactions.loc[i, 'timestamp'] = timestamps[j]
            if transactions['address'][i] != last:
                color.pInfo(transactions['address'][i] + ' transaction:' +
                            str(i))

            last = transactions['address'][i]
            j = j + 1
    color.pInfo('writing to ' + txn_file + ' .')
    transactions.to_csv(txn_file, index=False)
    color.pDone('Done')
Example #12
def compute_time(t0):

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # closest modern replacement
    color.pInfo("computation done in " + str(time.perf_counter() - t0) + "s")

    return time.perf_counter()
Example #13
def open_data(opcodes):

    t0 = time.perf_counter()

    color.pInfo("tools.open_data: define variables...")

    path = '/Users/e31989/Desktop/e31989/Documents/sm_database/'

    database_nml = path + 'normal.json'
    database_int = path + 'internal.json'
    database_op = path + 'opcode/opcodes_count/'

    database_nml_np = path + 'normal_np.json'
    database_int_np = path + 'internal_np.json'
    database_op_np = path + 'opcode_np/opcode_count/bytecode_np/'
  
    
    t1 = tl.compute_time(t0)
    
    #Open databases to access info
    
    color.pInfo("tools.open_data: open databases...")
    #ponzi instances
    with open(database_nml, 'r') as f:
        raw_nml= f.readlines()
        
    with open(database_int, 'r') as f:
        raw_int= f.readlines()
        
    op = [[f[:-5] for f in os.listdir(database_op) if f[-5:] == '.json'],
          [f[:-5] for f in os.listdir(database_op_np) if f[-5:] == '.json']]
    
    N = len(op[0])
    N_np = len(op[1])

    op_freq = [[], []]
    for add in op[0]:
        with open(database_op + add + '.json', 'r') as f:
            raw = f.readlines()
            res = [0 for i in range(len(opcodes))]
            if len(raw) > 1:
                tot = 0
                for opcode in raw:
                    # fixed-offset parse: the character at index 3 is the
                    # count and everything from index 5 on is the opcode
                    # name; this assumes the count files' exact layout
                    count = float(opcode[3])
                    tot += count
                    res[opcodes.index(opcode[5:-1])] = count
            else:
                tot = 1
            # normalize the counts into frequencies
            res = [x / tot for x in res]
            op_freq[0].append(res)
            
    #non ponzi instances
    with open(database_nml_np, 'r') as f:
        raw_nml_np= f.readlines()
        
    with open(database_int_np, 'r') as f:
        raw_int_np= f.readlines()
           
    for add in op[1]:
        with open(database_op_np + add + '.json', 'r') as f:
            raw = f.readlines()
            res = [0 for i in range(len(opcodes))]
            if len(raw) > 1:
                tot = 0
                for opcode in raw:
                    # same fixed-offset parse as above
                    count = float(opcode[3])
                    tot += count
                    res[opcodes.index(opcode[5:-1])] = count
            else:
                tot = 1
            res = [x / tot for x in res]
            op_freq[1].append(res)
    
    t2 = tl.compute_time(t1)
    
    with open(path + 'op_freq.json', 'w') as outfile:
        outfile.write(json.dumps(op_freq))
        print('op_freq serialized')
        
    # tr_dico is a list with one entry per smart contract; each entry is a
    # list with one element per transaction, and each element is a
    # dictionary describing that transaction.
    print("tools.open_data: create dictionaries...")
    #ponzi instances
    # the raw json files alternate address lines and transaction-list
    # lines, so even lines are addresses and odd lines are the data
    addr = [raw_nml[2*i][:-1] for i in range(len(raw_nml)//2)]
    addr_int = [raw_int[2*i][:-1] for i in range(len(raw_int)//2)]

    addr_np = [raw_nml_np[2*i][:-1] for i in range(len(raw_nml_np)//2)]
    addr_int_np = [raw_int_np[2*i][:-1] for i in range(len(raw_int_np)//2)]
        
    tr_dico = [
            #ponzi
            [[ast.literal_eval(raw_nml[2*addr.index(op[0][i])+1][:-1]),ast.literal_eval(raw_int[2*addr_int.index(op[0][i])+1][:-1])] for i in range(N)],
            #non ponzi
            [[ast.literal_eval(raw_nml_np[2*addr_np.index(op[1][i])+1][:-1]),ast.literal_eval(raw_int_np[2*addr_int_np.index(op[1][i])+1][:-1])] for i in range(N_np)]
            ]        
    
                
    tl.compute_time(t2)
    temp = int(N_np / 3)

    # saved in three separate files: single os.write/os.read calls fail on
    # files larger than 2 GB, and this dump is about 4.2 GB

    with open(path + 'tr_dico_nonponzi1.json', 'w') as f:
        f.write(json.dumps(tr_dico[1][:temp]))

    print('serialized first third of tr_dico')
        
    with open(path + 'tr_dico_nonponzi2.json','w') as f:
        f.write(json.dumps(tr_dico[1][temp:2*temp]))
   
    with open(path + 'tr_dico_nonponzi3.json','w') as f:
        f.write(json.dumps(tr_dico[1][2*temp:]))    
    print('everything has been serialized')
    
    return tr_dico
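The fixed-offset parse in open_data (opcode[3] for the count, opcode[5:-1] for the name) only works for one rigid line layout; a hypothetical line it would accept:

# Hypothetical count-file line: a single-digit count at index 3 and the
# opcode name from index 5 up to the trailing newline.
line = 'id 7 ADD\n'
count = float(line[3])   # -> 7.0
name = line[5:-1]        # -> 'ADD'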
Example #14
    out_file = os.path.join('result', name + '_out.out')
    color.pInfo('Sending outgoing transaction query to psql server')
    p.sendline('\\o ' + out_file)
    p.expect('#')
    sq.val_sql(addr, query_out, p)
    color.pDone('Have generated ' + out_file + '.')

    # collect the query result into txn features
    txn_file = os.path.join('result', name + '_out.csv')
    deal_sql.deal_out(addr, out_file, txn_file)

    return txn_file


if __name__ == '__main__':
    color.pInfo('Starting with address file in address folder')
    color.pInfo('Usage: python code/main.py')

    psql = 'psql --host 192.168.1.2 -U gby ethereum'
    addrs = ['116ponzi+116dapp_addr.csv']
    #addrs = ['dapp1.csv','dapp2.csv','dapp3.csv','dapp4.csv']
    #addrs = ['merged_'+str(i)+'.csv' for i in range(10)]

    # collect val and time sequence from addresses
    dirPath = 'address'
    # addrs = os.listdir(dirPath)
    p = connectPSQL(psql)
    times = [time.time()]

    #for addr in addrs:
    for i in range(1):
        pass  # loop body truncated in the original excerpt
Example #15
    color.pDone('Have generated ' + out_file + '.')

    # collect the query result into txn features
    txn_file = os.path.join('result', name + '_out.csv')
    deal_sql.deal_out(addr, out_file, txn_file)

    return txn_file


if __name__ == '__main__':
    Round = None
    try:
        Round = sys.argv[1]
    except IndexError:
        color.pInfo(
            'Starting with collecting addresses, usage: python main.py [Round]'
        )
        color.pInfo('If you have collected addresses in test_addr, ignore it.')

    # os.makedirs('log')
    # os.makedirs('sql')

    psql = 'psql --host 192.168.1.2 -U gby ethereum'

    if Round:
        # collect addresses
        p = connectPSQL(psql)
        for i in range(int(Round)):
            color.pInfo('Collecting round ' + str(i))
            collectAddr(p)
    p.sendline('\\q')