def deal_feature(file_in, file_out, file_data, ponzi=None):
    """Merge incoming and outgoing transaction sequences into one CSV.

    Reads the raw in/out transaction CSVs, collapses each into per-address
    value/time sequences via ``sequence``, and writes the two sides
    column-aligned into ``file_data``.

    Args:
        file_in: path to the CSV of incoming transactions.
        file_out: path to the CSV of outgoing transactions.
        file_data: path of the merged feature CSV to write.
        ponzi: unused; kept only for backward compatibility with old callers.
    """
    color.pInfo('Dealing with features')
    ins = pd.read_csv(file_in, encoding='utf-8')
    outs = pd.read_csv(file_out, encoding='utf-8')
    addr_in, val_ins, time_ins = sequence(ins)
    addr_out, val_outs, time_outs = sequence(outs)
    # zip the parallel sequences directly instead of indexing range(len(...));
    # the old dead 'contracts' accumulator and commented-out loop are removed
    df_in = pd.DataFrame(list(zip(addr_in, val_ins, time_ins)),
                         columns=['address', 'val_in', 'time_in'])
    df_out = pd.DataFrame(list(zip(addr_out, val_outs, time_outs)),
                          columns=['address', 'val_out', 'time_out'])
    # outer join keeps rows even when one side has fewer addresses
    df = pd.concat([df_in, df_out], join='outer', axis=1)
    df.to_csv(file_data, index=False)
    color.pDone('Have generated ' + file_data + '.')
def deal_out(addr_file, in_file, to_file):
    """Parse psql output of per-address outgoing-transaction queries into a CSV.

    The raw file holds one result block per queried address, in query order.
    A "(N rows)" footer line ends each block, and data rows start with the
    timestamp (a year beginning with '2'). Each data row becomes
    [address, timestamp, value] under the ``names_transaction`` columns.
    """
    color.pInfo('Dealing with ' + in_file)
    # append a sentinel so the parser has an unambiguous end marker
    os.system('echo \"EOF\" >> ' + in_file)
    addresses = readAddr(addr_file)
    rows = []
    cursor = 0  # which queried address the current result block belongs to
    with open(in_file, 'r', encoding='utf-8') as f:
        for raw in f:
            line = raw.strip()
            if line == 'EOF':
                break
            if not line:
                continue
            if line.startswith('('):
                # "(N rows)" footer: subsequent rows belong to the next address
                cursor = cursor + 1
            elif line.startswith('2'):
                fields = [part.strip() for part in line.split('|')]
                rows.append([addresses[cursor], fields[0], fields[1]])
    color.pInfo('collected ' + str(len(rows)) + ' transactions.')
    pd.DataFrame(data=rows, columns=names_transaction).to_csv(to_file, index=False)
    color.pDone('Done')
def collectAddr(p, n=N, timeout=120):
    """Fetch the next batch of ``n`` contract addresses from the psql server.

    Reads the current offset from log/collect.log, pulls ``n`` more contract
    addresses (non-zero-value external transactions, newest first) into
    result/addr_<offset>.out, then advances the log to offset + n.

    Args:
        p: a pexpect session already attached to psql.
        n: batch size (defaults to the module-level N).
        timeout: seconds to wait for the query before pexpect raises TIMEOUT.
    """
    log_file = os.path.join('log', 'collect.log')
    examLog(log_file)
    last = fetchLog(log_file)
    # BUG FIX: `n` was accepted but ignored — the body used the global N for
    # both the log advance and the query LIMIT, so callers passing a custom
    # batch size silently got N instead.
    new = int(last) + n
    out_file = os.path.join('result', 'addr_' + last + '.out')
    p.sendline('\o ' + out_file)
    p.expect('#')
    query = ('SELECT address FROM code WHERE address IN '
             '(SELECT to_address from external_transaction '
             'WHERE value!=\'0\' ORDER BY number DESC '
             'limit ' + str(n) + ' OFFSET ' + str(last) + ') '
             'ORDER BY number DESC;')
    p.sendline(query)
    color.pInfo('Excuting query \'' + query + '\', raising TimeOut exception in '
                + str(timeout) + ' sec.')
    p.expect('#', timeout=timeout)
    color.pDone('Done query.')
    with open(out_file) as f:
        out = f.readlines()
    try:
        # psql output: the second-to-last line is the last address row
        out = out[-2]
    except IndexError:
        # BUG FIX: was a bare except; only a too-short file is expected here
        color.pError('Failed to write the results')
        p.close()
        sys.exit(1)
    color.pDone('Collected address ' + out + '\nWritten in ' + out_file + ' .')
    writeLog(log_file, new)
def val_sql(addr_file, query, p):
    """Generate sql/val.sql from per-address queries and run it through psql.

    ``query`` is a [prefix, suffix] pair; each address from ``addr_file`` is
    spliced between the halves to form one statement per address. The script
    is executed with \\i and the function polls every 10 minutes until the
    psql prompt returns.
    """
    import pexpect
    addresses = readAddr(addr_file)
    sql_path = os.path.join('sql', 'val.sql')
    with open(sql_path, 'w') as f:
        f.writelines(query[0] + a + query[1] for a in addresses)
    p.sendline('\i ' + sql_path)
    waited = 1
    # pexpect returns index 1 (TIMEOUT) every 600s until '#' shows up
    while p.expect(['#', pexpect.TIMEOUT], timeout=600) == 1:
        color.pInfo('searched for ' + str(waited) + '0 mins')
        waited = waited + 1
def deal_in(addr_file, in_file, to_file):
    """Parse psql output of per-address incoming-transaction queries into a CSV.

    Data rows here start with a block hash (leading backslash in psql's
    escaped output); the timestamp column is left empty because
    external_transaction has no timestamp — the caller fills it in later
    from the returned block hashes (see deal_in_timestamp).

    Returns:
        list of block hashes, one per collected transaction, so the caller
        can query each block's timestamp separately.
    """
    color.pInfo('Dealing with ' + in_file)
    # append a sentinel so the reader loop has a definite stopping line
    os.system('echo \"EOF\" >> ' + in_file)
    address = readAddr(addr_file)
    transactions = []
    block_hash = []
    with open(in_file, 'r', encoding='utf-8') as f:
        index = 0
        line = f.readline().strip()
        while line != 'EOF':
            if line == '':
                line = f.readline().strip()
                continue
            if line[0] == '(':
                # "(N rows)" footer: subsequent rows belong to the next address
                index = index + 1
            if line[0] == '\\':
                attributes = [a.strip() for a in line.split('|')]
                try:
                    data = [address[index], '', attributes[1]]
                    block_hash.append(attributes[0])
                    transactions.append(data)
                except IndexError:
                    # BUG FIX: was a bare except that also swallowed
                    # KeyboardInterrupt/SystemExit; only running past the
                    # address list (or a short row) is expected here.
                    color.pError('out of index')
                    print('index', index)
                    print('attribute', attributes)
                    break
            line = f.readline().strip()
    df = pd.DataFrame(data=transactions, columns=names_transaction)
    df.to_csv(to_file, index=False)
    color.pDone('Done')
    return block_hash
def extract(database):
    """Extract basic per-contract transaction features into a feature CSV.

    Reads the merged in/out sequence CSV, computes basic statistical
    features for every contract that has any value sequence, and writes
    feature/<part0>_<part1>_feature.csv (parts taken from the input's
    basename split on underscores).
    """
    color.pInfo("Dealing with transaction data data")
    raw = pd.read_csv(database).fillna(0)
    # column order must match the tuple returned by tl.basic_features
    f_names = [
        'address',
        'nbr_tx_in', 'nbr_tx_out',
        'Tot_in', 'Tot_out',
        'mean_in', 'mean_out',
        'sdev_in', 'sdev_out',
        'gini_in', 'gini_out',
        'avg_time_btw_tx',
        'lifetime',
    ]
    tx_features = []
    for _, row in raw.iterrows():
        # skip contracts with neither incoming nor outgoing value sequences
        if row['val_in'] != '' or row['val_out'] != '':
            tx_features.append(
                tl.basic_features(None, row['address'], row['time_in'],
                                  row['time_out'], row['val_in'],
                                  row['val_out']))
    tl.compute_time(t0)  # t0 is module-level; this only logs elapsed time
    features = pd.DataFrame(tx_features, columns=f_names)
    stem = os.path.basename(database).split('.')[0]
    parts = stem.split('_')
    f_file = os.path.join(
        'feature', parts[0] + '_' + parts[1] + '_feature.csv')
    features.to_csv(f_file, index=None)
    color.pDone('Have written feature file ' + f_file + '.')
def timestamp_sql(in_hash, p):
    """Generate sql/time.sql querying each block hash's timestamp and run it.

    One SELECT per hash is written to the script, which is executed via \\i;
    the function then polls every 10 minutes until the psql prompt returns.
    """
    import pexpect
    sql_path = os.path.join('sql', 'time.sql')
    with open(sql_path, 'w') as f:
        for h in in_hash:
            f.write('select timestamp from block where hash=\'' + h + '\';\r')
    p.sendline('\i ' + sql_path)
    waited = 1
    # pexpect returns index 1 (TIMEOUT) every 600s until '#' shows up
    while p.expect(['#', pexpect.TIMEOUT], timeout=600) == 1:
        color.pInfo('searched for ' + str(waited) + '0 mins')
        waited = waited + 1
def collectTxnIn(p, addr, timeout=200):
    """Collect incoming transactions (external + internal) for all addresses.

    Pipeline: query external_transaction values/block hashes, parse into
    <name>_in.csv, backfill timestamps from the block table, then query
    internal_transaction and append those rows, sorted by address, back
    into <name>_in.csv.

    Args:
        p: pexpect session attached to psql.
        addr: path to the address file; its basename names the result files.
        timeout: unused here (queries are polled inside sq helpers).

    Returns:
        Path of the combined incoming-transaction CSV.
    """
    import sql_query as sq
    color.pInfo('Collecting transactions into contract')
    # [ext prefix, ext suffix, int prefix, int suffix]; the address is
    # spliced between each prefix/suffix pair by sq.val_sql
    query_in = [
        'select block_hash,value from external_transaction where to_address=\'',
        '\' and value!=\'0\' limit 1000;',
        'select timestamp, value from internal_transaction where to_address=\'',
        '\' and value!=\'0\' limit 1000;'
    ]
    name = os.path.basename(addr).split('.')[0]
    # send value/hash query for external transactions
    out_file = os.path.join('result', name + '_in.out')
    color.pInfo('Sending incoming transaction query to psql server')
    p.sendline('\o ' + out_file)
    p.expect('#')
    sq.val_sql(addr, query_in[:2], p)
    color.pDone('Have generated ' + out_file + '.')
    # resolve timestamps through the collected block hashes
    txn_file = os.path.join('result', name + '_in.csv')
    time_file = os.path.join('result', name + '_time.out')
    block_hash = deal_sql.deal_in(addr, out_file, txn_file)
    color.pInfo('Sending incoming timestamp query to psql server')
    p.sendline('\o ' + time_file)
    p.expect('#')
    sq.timestamp_sql(block_hash, p)
    color.pDone('Have generated ' + time_file + '.')
    deal_sql.deal_in_timestamp(txn_file, time_file)
    # internal transactions carry their own timestamps
    out_inter_file = os.path.join('result', name + '_internal.out')
    color.pInfo('Sending incoming transaction in internal_trx to psql server')
    p.sendline('\o ' + out_inter_file)
    p.expect('#')
    sq.val_sql(addr, query_in[-2:], p)
    color.pDone('Have generated ' + out_inter_file + '.')
    txn_file_inter = os.path.join('result', name + 'inter_in.csv')
    deal_sql.deal_out(addr, out_inter_file, txn_file_inter)
    df1 = pd.read_csv(txn_file)
    df2 = pd.read_csv(txn_file_inter)
    # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0
    df = pd.concat([df1, df2])
    # BUG FIX: sort_values returns a new frame; the result was discarded,
    # so the output was never actually sorted by address
    df = df.sort_values(by='address')
    df.to_csv(txn_file, index=None)
    color.pImportant('incoming txn shape' + str(df.shape))
    return txn_file
def collectTxnIn(p, addr, timeout=200):
    """Collect incoming external transactions for all addresses in ``addr``.

    NOTE(review): this redefines ``collectTxnIn`` — an earlier definition of
    the same name exists in this file and is shadowed by this one at import
    time. Decide which version is intended and remove the other.

    Pipeline: query external_transaction values/block hashes into
    result/<name>_in.out, parse them into result/<name>_in.csv, then query
    each block hash's timestamp and backfill the CSV's timestamp column.

    Args:
        p: pexpect session attached to psql.
        addr: path to the address file; its basename names the result files.
        timeout: unused here (queries are polled inside sq helpers).

    Returns:
        Path of the incoming-transaction CSV.
    """
    import sql_query as sq
    color.pInfo('Collecting transactions into contract')
    # [prefix, suffix]: the address is spliced between them by sq.val_sql
    query_in = [
        'select block_hash,value from external_transaction where to_address=\'',
        '\' and value!=\'0\';'
    ]
    name = os.path.basename(addr).split('.')[0]
    # redirect psql output to the raw capture file
    out_file = os.path.join('result', name + '_in.out')
    color.pInfo('Sending incoming transaction query to psql server')
    p.sendline('\o ' + out_file)
    p.expect('#')
    sq.val_sql(addr, query_in, p)
    color.pDone('Have generated ' + out_file + '.')
    # parse raw output; deal_in returns the block hashes whose timestamps
    # must be fetched separately (external_transaction has none)
    txn_file = os.path.join('result', name + '_in.csv')
    time_file = os.path.join('result', name + '_time.out')
    block_hash = deal_sql.deal_in(addr, out_file, txn_file)
    color.pInfo('Sending incoming timestamp query to psql server')
    p.sendline('\o ' + time_file)
    p.expect('#')
    sq.timestamp_sql(block_hash, p)
    color.pDone('Have generated ' + time_file + '.')
    # backfill the timestamp column in the transaction CSV
    txn_file = os.path.join('result', name + '_in.csv')
    deal_sql.deal_in_timestamp(txn_file, time_file)
    return txn_file
def collectTxnOut(p, addr, timeout=200):
    """Collect outgoing internal transactions for all addresses in ``addr``.

    Sends the per-address value query to the psql session ``p``, captures
    the raw output under result/, parses it into result/<name>_out.csv and
    returns that CSV path.

    Args:
        p: pexpect session attached to psql.
        addr: path to the address file; its basename names the result files.
        timeout: unused here (queries are polled inside sq helpers).

    Returns:
        Path of the outgoing-transaction CSV.
    """
    import sql_query as sq
    color.pInfo('Collecting transactions out of contract')
    # [prefix, suffix]: the address is spliced between them by sq.val_sql
    query_out = [
        'select timestamp, value from internal_transaction where from_address=\'',
        '\' and value!=\'0\' limit 1000;\r',
    ]
    stem = os.path.basename(addr).split('.')[0]
    raw_path = os.path.join('result', stem + '_out.out')
    csv_path = os.path.join('result', stem + '_out.csv')
    color.pInfo('Sending outcoming transaction query to psql server')
    p.sendline('\o ' + raw_path)
    p.expect('#')
    sq.val_sql(addr, query_out, p)
    color.pDone('Have generated ' + raw_path + '.')
    # parse the raw psql capture into the per-transaction CSV
    deal_sql.deal_out(addr, raw_path, csv_path)
    return csv_path
def deal_in_timestamp(txn_file, time_file):
    """Backfill the timestamp column of ``txn_file`` from psql timestamp output.

    Reads ``time_file`` (raw psql capture of per-block timestamp queries,
    one result per collected transaction, in the same order the transactions
    were written), then assigns timestamps positionally: the j-th timestamp
    line goes to the j-th transaction row that has a non-empty address.

    NOTE(review): correctness depends on ``time_file`` containing exactly one
    timestamp row per transaction row, in matching order — a missed query
    would raise IndexError on ``timestamps[j]``. Confirm against the
    generating sql script.
    """
    color.pInfo('Dealing with ' + time_file)
    # append a sentinel so the reader loop has a definite stopping line
    os.system('echo \"EOF\" >> ' + time_file)
    transactions = pd.read_csv(txn_file, low_memory=False)
    timestamps = []
    num = 0
    with open(time_file, 'r', encoding='utf-8') as f:
        line = f.readline().strip()
        while (line != 'EOF'):
            if line == '':
                line = f.readline().strip()
                continue
            # timestamp rows start with the year ('2...'); headers/footers don't
            if line[0] == '2':
                timestamps.append(line)
                num = num + 1
                if num % 1000000 == 0:
                    # progress log for very large captures
                    color.pDone('dealed ' + str(num) + ' timestamps')
            line = f.readline().strip()
    color.pInfo('adding timestamps to transaction')
    j = 0
    last = transactions['address'][0]
    for i in range(transactions.shape[0]):
        if transactions['address'][i]:  # not empty
            transactions.loc[i, 'timestamp'] = timestamps[j]
            if transactions['address'][i] != last:
                # log each time we move on to a new contract address
                color.pInfo(transactions['address'][i] + ' transaction:' +
                            str(i))
                last = transactions['address'][i]
            j = j + 1
    color.pInfo('writing to ' + txn_file + ' .')
    transactions.to_csv(txn_file, index=False)
    color.pDone('Done')
def compute_time(t0):
    """Log the elapsed time since ``t0`` and return a fresh reference point.

    Args:
        t0: a timestamp previously returned by this function or by
            ``time.perf_counter()``.

    Returns:
        The current ``time.perf_counter()`` value, so calls can be chained
        (``t1 = compute_time(t0)``).
    """
    # BUG FIX: time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the documented replacement for elapsed-time measuring.
    # Sampling once also makes the logged delta match the returned value.
    now = time.perf_counter()
    color.pInfo("computation done in " + str(now - t0) + "s")
    return now
def open_data(opcodes):
    """Load ponzi / non-ponzi contract databases and serialize features.

    For every contract that has an opcode-count json file, builds:
      * ``op_freq`` — [ponzi_rows, non_ponzi_rows] of relative opcode
        frequencies, each row ordered like ``opcodes``; written to
        op_freq.json;
      * ``tr_dico`` — [ponzi, non_ponzi] lists of per-contract
        [normal_txns, internal_txns] pairs parsed from the paired json
        dumps; the non-ponzi half is serialized across three files.

    Args:
        opcodes: ordered list of opcode mnemonics; positions in each
            frequency vector follow this list.

    Returns:
        tr_dico as described above.

    NOTE(review): time.clock() was removed in Python 3.8, so as written this
    only runs on older interpreters — confirm the target version.
    """
    t0 = time.clock()
    color.pInfo("tools.opend_data: define variables...")
    # hard-coded, machine-specific database layout
    path = '/Users/e31989/Desktop/e31989/Documents/sm_database/'
    database_nml = path + 'normal.json'           # ponzi normal transactions
    database_int = path + 'internal.json'         # ponzi internal transactions
    database_op = path + 'opcode/opcodes_count/'  # ponzi opcode counts (one json per address)
    database_nml_np = path + 'normal_np.json'     # non-ponzi normal transactions
    database_int_np = path + 'internal_np.json'   # non-ponzi internal transactions
    database_op_np = path + 'opcode_np/opcode_count/bytecode_np/'
    t1 = tl.compute_time(t0)
    # Open databases to access info
    color.pInfo("tools.open_data: open databases...")
    # ponzi instances: files alternate address line / transaction-list line
    with open(database_nml, 'r') as f:
        raw_nml = f.readlines()
    with open(database_int, 'r') as f:
        raw_int = f.readlines()
    # addresses that have an opcode-count file: op[0] = ponzi, op[1] = non-ponzi
    # (filenames minus the trailing '.json')
    op = [[f[:-5] for f in os.listdir(database_op) if f[-5:] == '.json'],
          [f[:-5] for f in os.listdir(database_op_np) if f[-5:] == '.json']]
    N = len(op[0])
    N_np = len(op[1])
    op_freq = [[], []]
    for add in op[0]:
        with open(database_op + add + '.json', 'r') as f:
            raw = f.readlines()
        res = [0 for i in range(len(opcodes))]
        if len(raw) > 1:
            tot = 0
            for opcode in raw:
                # NOTE(review): assumes a fixed-width line format where the
                # count is the single character at index 3 and the opcode
                # name spans columns 5..-1 — confirm against the dump format.
                count = float(opcode[3])
                tot += count
                res[opcodes.index(opcode[5:-1])] = count
        else:
            # empty dump: keep zeros, avoid division by zero below
            tot = 1
        # normalize counts into frequencies
        res = [x / tot for x in res]
        op_freq[0].append(res)
    # non ponzi instances — same parsing as above
    with open(database_nml_np, 'r') as f:
        raw_nml_np = f.readlines()
    with open(database_int_np, 'r') as f:
        raw_int_np = f.readlines()
    for add in op[1]:
        with open(database_op_np + add + '.json', 'r') as f:
            raw = f.readlines()
        res = [0 for i in range(len(opcodes))]
        if len(raw) > 1:
            tot = 0
            for opcode in raw:
                count = float(opcode[3])
                tot += count
                res[opcodes.index(opcode[5:-1])] = count
        else:
            tot = 1
        res = [x / tot for x in res]
        op_freq[1].append(res)
    t2 = tl.compute_time(t1)
    with open(path + 'op_freq.json', 'w') as outfile:
        outfile.write(json.dumps(op_freq))
    print('op_freq serialized')
    # tr_dico is a list of which the size is the number of SM, each element
    # is a list of which the size is the number of transactions, each element
    # is a dictionary containing data about a specific transaction.
    print("tools.open_data: create dictionnaries...")
    # ponzi instances: even lines are addresses (strip trailing newline)
    addr = [raw_nml[2 * i][:-1] for i in range(len(raw_nml) // 2)]
    addr_int = [raw_int[2 * i][:-1] for i in range(len(raw_int) // 2)]
    addr_np = [raw_nml_np[2 * i][:-1] for i in range(len(raw_nml_np) // 2)]
    addr_int_np = [raw_int_np[2 * i][:-1] for i in range(len(raw_int_np) // 2)]
    # odd lines (2*index+1) hold the literal transaction lists; look up each
    # opcode-file address in the dump's address list to pair normal/internal
    tr_dico = [
        # ponzi
        [[ast.literal_eval(raw_nml[2 * addr.index(op[0][i]) + 1][:-1]),
          ast.literal_eval(raw_int[2 * addr_int.index(op[0][i]) + 1][:-1])]
         for i in range(N)],
        # non ponzi
        [[ast.literal_eval(raw_nml_np[2 * addr_np.index(op[1][i]) + 1][:-1]),
          ast.literal_eval(raw_int_np[2 * addr_int_np.index(op[1][i]) + 1][:-1])]
         for i in range(N_np)]
    ]
    tl.compute_time(t2)
    temp = int(N_np / 3)
    # saved in three different files, because os.write and os.read don't
    # support files with size superior to 2GB; ours is 4.2GB.
    with open(path + 'tr_dico_nonponzi1.json', 'w') as f:
        f.write(json.dumps(tr_dico[1][:temp]))
    print('serialized half tr_dico')
    with open(path + 'tr_dico_nonponzi2.json', 'w') as f:
        f.write(json.dumps(tr_dico[1][temp:2 * temp]))
    with open(path + 'tr_dico_nonponzi3.json', 'w') as f:
        f.write(json.dumps(tr_dico[1][2 * temp:]))
    print('everything has been serialized')
    return tr_dico
out_file = os.path.join('result', name + '_out.out') color.pInfo('Sending outcoming transaction query to psql server') p.sendline('\o ' + out_file) p.expect('#') sq.val_sql(addr, query_out, p) color.pDone('Have generated ' + out_file + '.') # collect the query result into txn features txn_file = os.path.join('result', name + '_out.csv') deal_sql.deal_out(addr, out_file, txn_file) return txn_file if __name__ == '__main__': color.pInfo('Starting with addresse file in address folder') color.pInfo('Usage: python code/main.py') psql = 'psql --host 192.168.1.2 -U gby ethereum' addrs = ['116ponzi+116dapp_addr.csv'] #addrs = ['dapp1.csv','dapp2.csv','dapp3.csv','dapp4.csv'] #addrs = ['merged_'+str(i)+'.csv' for i in range(10)] # collect val and time sequence from addresses dirPath = 'address' # addrs = os.listdir(dirPath) p = connectPSQL(psql) times = [time.time()] #for addr in addrs: for i in range(1):
color.pDone('Have generated ' + out_file + '.') # collect the query result into txn features txn_file = os.path.join('result', name + '_out.csv') deal_sql.deal_out(addr, out_file, txn_file) return txn_file if __name__ == '__main__': Round = None try: Round = sys.argv[1] except: color.pInfo( 'Starting with collecting addresses, usage: python main.py [Round]' ) color.pInfo('If you have collected addresses in test_addr, ignore it.') # os.makedirs('log') # os.makedirs('sql') psql = 'psql --host 192.168.1.2 -U gby ethereum' if Round: # collect addresses p = connectPSQL(psql) for i in range(int(Round)): color.pInfo('Collecting round ', i) collectAddr(p) p.sendline('\q')