def put(self, request, original_variant, pk):
    """Modify an existing variant in both HBase and Impala.

    :param request: request-like object; ``request.data`` holds the modified
        variant (JSON-compatible dict), ``request.user`` the caller.
    :param original_variant: dict with the variant as currently stored.
    :param pk: primary key of the variant, used as the HBase row key.
    :raises Exception: if the Impala statement yields no handle.
    """
    # Debug dump of the incoming payload; 'with' guarantees the handle is
    # closed even if json.dumps raises (the original leaked it on error).
    with open('/tmp/superhello.txt', 'w') as f:
        f.write(json.dumps(request.data))

    # We convert the original and modified data to flatjson
    fc = formatConverters(input_file='stuff.json', output_file='stuff.json')
    original_flatjson = fc.convertVariantJsonToFlatJson(json_data=original_variant)
    modified_flatjson = fc.convertVariantJsonToFlatJson(json_data=request.data)

    # We convert the data to hbase, and we modify directly some fields
    # (note: the keys are almost the same for hbase and impala)
    hbase_data = fc.convertVariantFlatJsonToHbase(original_data=original_flatjson,
                                                  modified_data=modified_flatjson)

    # Impala - We make the query
    query_server = get_query_server_config(name='impala')
    db = dbms.get(request.user, query_server=query_server)
    # FIXME(review): 'query_data' is not defined anywhere in this method, so the
    # next line raises NameError at runtime, and the statement also lacks a
    # VALUES clause. The column/value list still has to be built (presumably
    # from hbase_data) — and with proper escaping, not raw concatenation.
    query = hql_query("INSERT INTO variant(" + ",".join(query_data) + ")")
    handle = db.execute_and_wait(query, timeout_sec=5.0)
    if handle:
        db.close(handle)
    else:
        raise Exception("Impossible to create the variant...")

    # HBase - We add the data in that table too
    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()
    rowkey = pk
    hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants',
                    row=rowkey, data=hbase_data)
def put(self, request, original_variant, pk):
    """Modify an existing variant in both HBase and Impala.

    ``request.data`` carries the modified variant, ``original_variant`` the
    stored one, and ``pk`` becomes the HBase row key.

    NOTE(review): 'query_data' used below is not defined in this method, so
    the Impala INSERT raises NameError as written — confirm intended source
    of the column list (presumably hbase_data keys).
    """
    # Allow to modify a variant in HBase/Impala
    # Debug dump of the incoming payload (handle not closed if json.dumps raises).
    f = open('/tmp/superhello.txt', 'w')
    f.write(json.dumps(request.data))
    f.close()
    # We convert the original and modified data to flatjson
    fc = formatConverters(input_file='stuff.json', output_file='stuff.json')
    original_flatjson = fc.convertVariantJsonToFlatJson(
        json_data=original_variant)
    modified_flatjson = fc.convertVariantJsonToFlatJson(
        json_data=request.data)
    # We convert the data to hbase, and we modify directly some fields
    # (note: the keys are almost the same for hbase and impala)
    hbase_data = fc.convertVariantFlatJsonToHbase(
        original_data=original_flatjson, modified_data=modified_flatjson)
    # Impala - We make the query
    query_server = get_query_server_config(name='impala')
    db = dbms.get(request.user, query_server=query_server)
    # FIXME(review): query_data is undefined (NameError) and the INSERT has
    # no VALUES clause; also built by string concatenation — see docstring.
    query = hql_query("INSERT INTO variant(" + ",".join(query_data) + ")")
    handle = db.execute_and_wait(query, timeout_sec=5.0)
    if handle:
        db.close(handle)
    else:
        # No handle means the statement did not run.
        raise Exception("Impossible to create the variant...")
    # HBase - We add the data in that table too
    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()
    rowkey = pk
    hbaseApi.putRow(cluster=currentCluster['name'],
                    tableName='variants',
                    row=rowkey,
                    data=hbase_data)
def import_of_vcf(request, filename, length):
    """Import a VCF file into the 'variants' Impala/Hive and HBase tables.

    The VCF (already on HDFS under /user/<username>/) is pulled to this node,
    converted to flat json, then to an hbase-keyed file and an Avro file; the
    Avro is loaded through a temporary Hive table into the global parquet
    'variants' table, and the hbase file is replayed row by row into HBase.
    Slow — intended to be called asynchronously.

    :param request: request-like object exposing .user and .fs (HDFS access —
        assumed to provide read/create/append; TODO confirm API).
    :param filename: name of the VCF file in the user's HDFS home directory.
    :param length: size of that file in bytes.
    :return: True on success, False if the Impala connection fails.
    """
    # It is in charge to import a vcf (convert the vcf to avro, etc.), and as
    # it is not fast, we should call this method asynchronously.
    # Connection to the db
    try:
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
    except Exception:
        # Best-effort: no Impala connection means nothing can be imported.
        return False
    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()
    # To analyze the content of the vcf, we need to get it from the hdfs to
    # this node. NOTE: 'buffer' shadows the Python 2 builtin of the same name.
    buffer = min(length, 1024 * 1024 * 512)
    tmp_filename = 'import_' + request.user.username + '_' + str(
        int(time.time()))
    f = open('/tmp/cgs_' + tmp_filename + '.vcf', mode='w')
    for offset in xrange(0, length, buffer):
        tmp_vcf = request.fs.read(path='/user/' + request.user.username + '/' + filename, offset=offset, length=buffer, bufsize=buffer)
        f.write(tmp_vcf)
    f.close()
    # Now we try to analyze the vcf a little bit more with the correct tool
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.vcf',
                               output_file='/tmp/cgs_' + tmp_filename + '.json',
                               input_type='vcf',
                               output_type='jsonflat')
    status, columns, ids_of_samples, rowkeys = convert.convertVcfToFlatJson(
        request=request, initial_file=filename)
    # /tmp/cgs_superhello.txt is an ad-hoc debug trace, truncated here ('w')
    # and appended to ('a') at every later step.
    f = open('/tmp/cgs_superhello.txt', 'w')
    f.write('EXECUTION TIME to flat json:' + str(time.time() - st) + '\n')
    f.close()
    # We put the output on hdfs (chunked appends, 50MB max per append)
    json_size = os.path.getsize('/tmp/cgs_' + tmp_filename + '.json')
    buffer = min(json_size, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.json', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.json', overwrite=True, data='')
        for offset in xrange(0, json_size, buffer):
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing flatjson to hdfs (/user/cgs/' + tmp_filename + ')... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.json', data=content_file.read(buffer))
    # We eventually modify the avsc file with the new calls (well, in fact, we
    # get the basic schema and we will [merge?] data from the existing db)
    avro_schema = {}
    with open('myapps/variants/variants.avsc', 'r') as content_file:
        avro_schema = json.loads(content_file.read())
    with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as f:
        f.write(json.dumps(avro_schema))
    existing_columns = []
    for field in avro_schema['fields']:
        existing_columns.append(field['name'])
    modified_avro_schema = False
    specific_columns = []  # Used below for the import in impala/hive
    for sample_id in ids_of_samples:
        # One 'I_CALL_<sample>' column per sample found in the VCF.
        destination_field = 'I_CALL_' + sample_id
        if destination_field not in specific_columns:
            specific_columns.append(destination_field)
        if destination_field not in existing_columns:
            # The current sample does not exist yet in the avsc file, we need to add it
            call_schema = {
                "name": destination_field,
                "type": ["string", "null"],
                "doc": "Column for a specific sample"
            }
            avro_schema['fields'].append(call_schema)
            existing_columns.append(destination_field)
            modified_avro_schema = True
    if modified_avro_schema is True:
        # Persist the extended schema locally and on HDFS.
        with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as content_file:
            content_file.write(json.dumps(avro_schema))
        request.fs.create('/user/cgs/cgs_' + tmp_filename + '.avsc', overwrite=True, data=json.dumps(avro_schema))
    # We convert the flat json to hbase (mostly a key mapping)
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.json',
                               output_file='/tmp/cgs_' + tmp_filename + '.hbase',
                               input_type='jsonflat',
                               output_type='hbase')
    status = convert.convertFlatJsonToHbase()
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Conversion from flatjson to hbase... ' + str(time.time() - st) + '\n')
    ftmp.close()
    # We put the hbase file on hdfs
    hbase_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.hbase')
    buffer = min(hbase_length, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.hbase', overwrite=True, data='')
        for offset in xrange(0, hbase_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing hbase to hdfs (/user/cgs/' + tmp_filename + '.hbase)... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.hbase', data=cont)
    # We convert the hbase to avro file
    st = time.time()
    convert = formatConverters(
        input_file='/tmp/cgs_' + tmp_filename + '.hbase',
        output_file='/tmp/cgs_' + tmp_filename + '.avro',
        input_type='jsonflat',
        output_type='avro')
    status = convert.convertHbaseToAvro(avscFile='/tmp/cgs_' + tmp_filename + '.avsc')
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Conversion from hbase to avro... ' + str(time.time() - st) + '\n')
    ftmp.close()
    # We put the avro file on hdfs (a working copy and an archive copy)
    st = time.time()
    avro_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.avro')
    buffer = min(avro_length, 1024 * 1024 * 50)
    with open('/tmp/cgs_' + tmp_filename + '.avro', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.avro', overwrite=True, data='')
        request.fs.create('/user/cgs/' + tmp_filename + '.archive.avro', overwrite=True, data='')
        for offset in xrange(0, avro_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing avro to hdfs (/user/cgs/' + tmp_filename + '.avro)... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.avro', data=cont)
            request.fs.append('/user/cgs/' + tmp_filename + '.archive.avro', data=cont)
    tmpf = open('/tmp/cgs_superhello.txt', 'a')
    # 0: We get the columns from the parquet table to detect missing columns
    # for the new calls we just created
    query = hql_query("show column stats variants")
    handle = db.execute_and_wait(query, timeout_sec=30.0)
    data = db.fetch(handle, rows=1000000)
    rows = list(data.rows())
    columns_for_new_calls = []
    existing_calls_columns = []
    for row in rows:
        # row[0] is the column name in 'show column stats' output.
        current_column = row[0]
        if current_column.startswith('i_call_'):
            existing_calls_columns.append(str(current_column).lower())
    for current_sample in ids_of_samples:
        destination_field = str('I_CALL_' + current_sample).lower()
        if destination_field not in existing_calls_columns and destination_field not in columns_for_new_calls:
            columns_for_new_calls.append(destination_field)
    tmpf.write("Existing calls: " + json.dumps(existing_calls_columns) + "\r\n")
    tmpf.write("New calls: " + json.dumps(columns_for_new_calls))
    tmpf.close()
    # 1st: we create a temporary hive table with avro storage
    st = time.time()
    result, variants_table = database_create_variants(
        request, temporary=True, specific_columns=specific_columns)
    # NOTE(review): this handle is never written to nor closed (leak).
    tmpf = open('/tmp/cgs_superhello.txt', 'a')
    # 2nd: we import the previously created avro file inside the temporary avro table
    query_server = get_query_server_config(name='hive')
    hive_db = dbms.get(request.user, query_server=query_server)
    variants_columns = []
    for variants_column in variants_table:
        # variants_table entries look like 'name TYPE'; keep the name part.
        variants_columns.append(str(variants_column).split(' ').pop(0))
    # NOTE(review): HQL below is built by concatenating the username — assumed
    # already restricted to safe characters; confirm upstream validation.
    query = hql_query("load data inpath '/user/cgs/" + tmp_filename + ".avro' into table variants_tmp_" + request.user.username + ";")
    handle = hive_db.execute_and_wait(query, timeout_sec=3600.0)
    # Necessary for impala to detect an hive table
    query = hql_query("invalidate metadata;")
    handle = db.execute_and_wait(query, timeout_sec=30.0)
    # 3rd: we eventually modify the global parquet table to add the eventual
    # new columns for each call
    if len(columns_for_new_calls) > 0:
        query = hql_query("alter table variants add columns (" + ' STRING, '.join(columns_for_new_calls) + " STRING)")
        handle = db.execute_and_wait(query, timeout_sec=3600.0)
    # 4th: we import the data from the temporary avro table to the global parquet table
    query = hql_query("insert into table variants (" + ','.join(variants_columns) + ") select " + ','.join(variants_columns) + " from variants_tmp_" + request.user.username + " ;")
    handle = db.execute_and_wait(query, timeout_sec=3600.0)
    # 5th: we delete the temporary table
    #query = hql_query("drop table variants_tmp_"+request.user.username+";")
    #handle = hive_db.execute_and_wait(query, timeout_sec=30.0)
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write(
        'Creation of temporary table, import to global variants table (parquet): '
        + str(time.time() - st) + '\n')
    ftmp.close()
    st = time.time()
    # We put the data in HBase. For now we do it simply, but we should use the bulk upload (TODO)
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        for line in content_file:
            # We create the json content (one json object per line)
            hbase_data = json.loads(line)
            rowkey = hbase_data['rowkey']
            del hbase_data['rowkey']
            del hbase_data['pk']
            # We can save the new variant
            hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Import into HBase: ' + str(time.time() - st) + '\n')
    ftmp.close()
    # We delete the temporary file previously created on this node
    os.remove('/tmp/cgs_' + tmp_filename + '.avsc')
    os.remove('/tmp/cgs_' + tmp_filename + '.vcf')
    os.remove('/tmp/cgs_' + tmp_filename + '.json')
    os.remove('/tmp/cgs_' + tmp_filename + '.avro')
    os.remove('/tmp/cgs_' + tmp_filename + '.hbase')
    return True
def import_of_vcf(request, filename, length):
    """Import a VCF file into the 'variants' Impala/Hive and HBase tables.

    The VCF (already on HDFS under /user/<username>/) is pulled to this node,
    converted to flat json, then to an hbase-keyed file and an Avro file; the
    Avro is loaded through a temporary Hive table into the global parquet
    'variants' table, and the hbase file is replayed row by row into HBase.
    Slow — intended to be called asynchronously.

    :param request: request-like object exposing .user and .fs (HDFS access —
        assumed to provide read/create/append; TODO confirm API).
    :param filename: name of the VCF file in the user's HDFS home directory.
    :param length: size of that file in bytes.
    :return: True on success, False if the Impala connection fails.
    """
    debug_log = '/tmp/cgs_superhello.txt'

    def _debug(message, mode='a'):
        # Ad-hoc debug trace; open/write/close per message (via 'with', so the
        # handle is always released) so partial progress survives a crash.
        with open(debug_log, mode) as df:
            df.write(message)

    # Connection to the db; no Impala connection means nothing can be imported.
    try:
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
    except Exception:
        return False
    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()

    # To analyze the content of the vcf, we need to get it from the hdfs to
    # this node. chunk_size (renamed from 'buffer', which shadowed the
    # builtin) caps each HDFS read at 512MB.
    chunk_size = min(length, 1024 * 1024 * 512)
    tmp_filename = 'import_' + request.user.username + '_' + str(int(time.time()))
    with open('/tmp/cgs_' + tmp_filename + '.vcf', mode='w') as f:
        for offset in xrange(0, length, chunk_size):
            tmp_vcf = request.fs.read(path='/user/' + request.user.username + '/' + filename,
                                      offset=offset, length=chunk_size, bufsize=chunk_size)
            f.write(tmp_vcf)

    # Now we try to analyze the vcf a little bit more with the correct tool
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.vcf',
                               output_file='/tmp/cgs_' + tmp_filename + '.json',
                               input_type='vcf', output_type='jsonflat')
    status, columns, ids_of_samples, rowkeys = convert.convertVcfToFlatJson(
        request=request, initial_file=filename)
    # mode='w' truncates the trace file at the start of each import.
    _debug('EXECUTION TIME to flat json:' + str(time.time() - st) + '\n', mode='w')

    # We put the output on hdfs (chunked appends, 50MB max per append)
    json_size = os.path.getsize('/tmp/cgs_' + tmp_filename + '.json')
    chunk_size = min(json_size, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.json', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.json', overwrite=True, data='')
        for offset in xrange(0, json_size, chunk_size):
            _debug('Pushing flatjson to hdfs (/user/cgs/' + tmp_filename + ')... ' + str(time.time() - st) + '\n')
            request.fs.append('/user/cgs/' + tmp_filename + '.json', data=content_file.read(chunk_size))

    # We eventually modify the avsc file with the new calls (starting from the
    # basic schema shipped with the app).
    avro_schema = {}
    with open('myapps/variants/variants.avsc', 'r') as content_file:
        avro_schema = json.loads(content_file.read())
    with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as f:
        f.write(json.dumps(avro_schema))
    existing_columns = [field['name'] for field in avro_schema['fields']]
    modified_avro_schema = False
    specific_columns = []  # Used below for the import in impala/hive
    for sample_id in ids_of_samples:
        # One 'I_CALL_<sample>' column per sample found in the VCF.
        destination_field = 'I_CALL_' + sample_id
        if destination_field not in specific_columns:
            specific_columns.append(destination_field)
        if destination_field not in existing_columns:
            # The current sample does not exist yet in the avsc file: add it.
            avro_schema['fields'].append({"name": destination_field,
                                          "type": ["string", "null"],
                                          "doc": "Column for a specific sample"})
            existing_columns.append(destination_field)
            modified_avro_schema = True
    if modified_avro_schema is True:
        # Persist the extended schema locally and on HDFS.
        with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as content_file:
            content_file.write(json.dumps(avro_schema))
        request.fs.create('/user/cgs/cgs_' + tmp_filename + '.avsc', overwrite=True, data=json.dumps(avro_schema))

    # We convert the flat json to hbase (mostly a key mapping)
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.json',
                               output_file='/tmp/cgs_' + tmp_filename + '.hbase',
                               input_type='jsonflat', output_type='hbase')
    status = convert.convertFlatJsonToHbase()
    _debug('Conversion from flatjson to hbase... ' + str(time.time() - st) + '\n')

    # We put the hbase file on hdfs
    hbase_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.hbase')
    chunk_size = min(hbase_length, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.hbase', overwrite=True, data='')
        for offset in xrange(0, hbase_length, chunk_size):
            cont = content_file.read(chunk_size)
            _debug('Pushing hbase to hdfs (/user/cgs/' + tmp_filename + '.hbase)... ' + str(time.time() - st) + '\n')
            request.fs.append('/user/cgs/' + tmp_filename + '.hbase', data=cont)

    # We convert the hbase to avro file
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.hbase',
                               output_file='/tmp/cgs_' + tmp_filename + '.avro',
                               input_type='jsonflat', output_type='avro')
    status = convert.convertHbaseToAvro(avscFile='/tmp/cgs_' + tmp_filename + '.avsc')
    _debug('Conversion from hbase to avro... ' + str(time.time() - st) + '\n')

    # We put the avro file on hdfs (a working copy and an archive copy)
    st = time.time()
    avro_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.avro')
    chunk_size = min(avro_length, 1024 * 1024 * 50)
    with open('/tmp/cgs_' + tmp_filename + '.avro', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.avro', overwrite=True, data='')
        request.fs.create('/user/cgs/' + tmp_filename + '.archive.avro', overwrite=True, data='')
        for offset in xrange(0, avro_length, chunk_size):
            cont = content_file.read(chunk_size)
            _debug('Pushing avro to hdfs (/user/cgs/' + tmp_filename + '.avro)... ' + str(time.time() - st) + '\n')
            request.fs.append('/user/cgs/' + tmp_filename + '.avro', data=cont)
            request.fs.append('/user/cgs/' + tmp_filename + '.archive.avro', data=cont)

    # 0: We get the columns from the parquet table to detect missing columns
    # for the new calls we just created
    query = hql_query("show column stats variants")
    handle = db.execute_and_wait(query, timeout_sec=30.0)
    data = db.fetch(handle, rows=1000000)
    rows = list(data.rows())
    columns_for_new_calls = []
    existing_calls_columns = []
    for row in rows:
        # row[0] is the column name in 'show column stats' output.
        current_column = row[0]
        if current_column.startswith('i_call_'):
            existing_calls_columns.append(str(current_column).lower())
    for current_sample in ids_of_samples:
        destination_field = str('I_CALL_' + current_sample).lower()
        if destination_field not in existing_calls_columns and destination_field not in columns_for_new_calls:
            columns_for_new_calls.append(destination_field)
    _debug("Existing calls: " + json.dumps(existing_calls_columns) + "\r\n")
    _debug("New calls: " + json.dumps(columns_for_new_calls))

    # 1st: we create a temporary hive table with avro storage
    # (a stray never-used open() of the trace file was removed here — it
    # leaked a file handle and wrote nothing)
    st = time.time()
    result, variants_table = database_create_variants(
        request, temporary=True, specific_columns=specific_columns)

    # 2nd: we import the previously created avro file inside the temporary avro table
    query_server = get_query_server_config(name='hive')
    hive_db = dbms.get(request.user, query_server=query_server)
    # variants_table entries look like 'name TYPE'; keep the name part.
    variants_columns = [str(variants_column).split(' ').pop(0)
                        for variants_column in variants_table]
    # NOTE(review): HQL below is built by concatenating the username — assumed
    # already restricted to safe characters; confirm upstream validation.
    query = hql_query("load data inpath '/user/cgs/" + tmp_filename + ".avro' into table variants_tmp_" + request.user.username + ";")
    handle = hive_db.execute_and_wait(query, timeout_sec=3600.0)
    # Necessary for impala to detect an hive table
    query = hql_query("invalidate metadata;")
    handle = db.execute_and_wait(query, timeout_sec=30.0)

    # 3rd: we eventually modify the global parquet table to add the eventual
    # new columns for each call
    if len(columns_for_new_calls) > 0:
        query = hql_query("alter table variants add columns (" + ' STRING, '.join(columns_for_new_calls) + " STRING)")
        handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 4th: we import the data from the temporary avro table to the global parquet table
    query = hql_query("insert into table variants (" + ','.join(variants_columns) + ") select " + ','.join(variants_columns) + " from variants_tmp_" + request.user.username + " ;")
    handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 5th: we delete the temporary table
    #query = hql_query("drop table variants_tmp_"+request.user.username+";")
    #handle = hive_db.execute_and_wait(query, timeout_sec=30.0)
    _debug('Creation of temporary table, import to global variants table (parquet): ' + str(time.time() - st) + '\n')

    st = time.time()
    # We put the data in HBase. For now we do it simply, but we should use the bulk upload (TODO)
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        for line in content_file:
            # One json object per line; rowkey/pk are transport fields, not cells.
            hbase_data = json.loads(line)
            rowkey = hbase_data['rowkey']
            del hbase_data['rowkey']
            del hbase_data['pk']
            # We can save the new variant
            hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)
    _debug('Import into HBase: ' + str(time.time() - st) + '\n')

    # We delete the temporary files previously created on this node
    for extension in ('.avsc', '.vcf', '.json', '.avro', '.hbase'):
        os.remove('/tmp/cgs_' + tmp_filename + extension)
    return True