Example no. 1
    def put(self, request, original_variant, pk):
        # Allows modifying a variant in HBase/Impala

        f = open('/tmp/superhello.txt','w')
        f.write(json.dumps(request.data))
        f.close()

        # We convert the original and modified data to flatjson
        fc = formatConverters(input_file='stuff.json',output_file='stuff.json')
        original_flatjson = fc.convertVariantJsonToFlatJson(json_data=original_variant)
        modified_flatjson = fc.convertVariantJsonToFlatJson(json_data=request.data)

        # We convert the data to hbase, and we directly modify some fields (note: the keys are almost the same for hbase and impala)
        hbase_data = fc.convertVariantFlatJsonToHbase(original_data=original_flatjson,modified_data=modified_flatjson)


        # Impala - We make the query
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
        query = hql_query("INSERT INTO variant("+",".join(query_data)+")")
        handle = db.execute_and_wait(query, timeout_sec=5.0)
        if handle:
            db.close(handle)
        else:
            raise Exception("Impossible to create the variant...")

        # HBase - We add the data in that table too
        hbaseApi = HbaseApi(user=request.user)
        currentCluster = hbaseApi.getClusters().pop()
        rowkey = pk
        hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)
Example no. 2
    def put(self, request, original_variant, pk):
        # Allows modifying a variant in HBase/Impala

        f = open('/tmp/superhello.txt', 'w')
        f.write(json.dumps(request.data))
        f.close()

        # We convert the original and modified data to flatjson
        fc = formatConverters(input_file='stuff.json',
                              output_file='stuff.json')
        original_flatjson = fc.convertVariantJsonToFlatJson(
            json_data=original_variant)
        modified_flatjson = fc.convertVariantJsonToFlatJson(
            json_data=request.data)

        # We convert the data to hbase, and we directly modify some fields (note: the keys are almost the same for hbase and impala)
        hbase_data = fc.convertVariantFlatJsonToHbase(
            original_data=original_flatjson, modified_data=modified_flatjson)

        # Impala - We make the query
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
        query = hql_query("INSERT INTO variant(" + ",".join(query_data) + ")")
        handle = db.execute_and_wait(query, timeout_sec=5.0)
        if handle:
            db.close(handle)
        else:
            raise Exception("Impossible to create the variant...")

        # HBase - We add the data in that table too
        hbaseApi = HbaseApi(user=request.user)
        currentCluster = hbaseApi.getClusters().pop()
        rowkey = pk
        hbaseApi.putRow(cluster=currentCluster['name'],
                        tableName='variants',
                        row=rowkey,
                        data=hbase_data)
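
Note that in both versions of put() above, query_data is never defined inside the method and the generated statement stops at the column list, so the Impala INSERT cannot run as written. Below is only a minimal sketch of what a complete statement could look like, assuming (hypothetically) that the column/value pairs can be taken from the hbase_data dictionary returned by convertVariantFlatJsonToHbase and that its keys map directly to Impala column names, as the original comment about the keys being "almost the same" suggests:

    # Hypothetical completion of the Impala INSERT; the mapping from hbase_data
    # keys to Impala columns and the naive quoting are assumptions, not the
    # original implementation.
    columns = sorted(hbase_data.keys())
    values = ["'" + str(hbase_data[c]).replace("'", "\\'") + "'" for c in columns]
    query = hql_query("INSERT INTO variant (" + ",".join(columns) + ") " +
                      "VALUES (" + ",".join(values) + ")")
    handle = db.execute_and_wait(query, timeout_sec=5.0)
    if handle:
        db.close(handle)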
Example no. 3
def import_of_vcf(request, filename, length):
    # This function is in charge of importing a VCF (converting the VCF to Avro, etc.); as it is not fast, it should be
    # called asynchronously

    # Connection to the db
    try:
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
    except Exception:
        return False

    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()

    # To analyze the content of the vcf, we need to get it from the hdfs to this node
    buffer = min(length, 1024 * 1024 * 512)
    tmp_filename = 'import_' + request.user.username + '_' + str(
        int(time.time()))
    f = open('/tmp/cgs_' + tmp_filename + '.vcf', mode='w')
    for offset in xrange(0, length, buffer):
        tmp_vcf = request.fs.read(path='/user/' + request.user.username + '/' +
                                  filename,
                                  offset=offset,
                                  length=buffer,
                                  bufsize=buffer)
        f.write(tmp_vcf)
    f.close()

    # Now we try to analyze the vcf a little bit more with the correct tool
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.vcf',
                               output_file='/tmp/cgs_' + tmp_filename +
                               '.json',
                               input_type='vcf',
                               output_type='jsonflat')
    status, columns, ids_of_samples, rowkeys = convert.convertVcfToFlatJson(
        request=request, initial_file=filename)
    f = open('/tmp/cgs_superhello.txt', 'w')
    f.write('EXECUTION TIME to flat json:' + str(time.time() - st) + '\n')
    f.close()

    # We put the output on hdfs
    json_size = os.path.getsize('/tmp/cgs_' + tmp_filename + '.json')
    buffer = min(json_size, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.json', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.json',
                          overwrite=True,
                          data='')
        for offset in xrange(0, json_size, buffer):
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing flatjson to hdfs (/user/cgs/' + tmp_filename +
                       ')... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.json',
                              data=content_file.read(buffer))

    # We modify the avsc file with the new calls if needed (in fact, we get the basic schema
    # and we will add data from the existing db)
    avro_schema = {}
    with open('myapps/variants/variants.avsc', 'r') as content_file:
        avro_schema = json.loads(content_file.read())
        with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as f:
            f.write(json.dumps(avro_schema))

    existing_columns = []
    for field in avro_schema['fields']:
        existing_columns.append(field['name'])
    modified_avro_schema = False
    specific_columns = []  # Used below for the import in impala/hive
    for sample_id in ids_of_samples:
        destination_field = 'I_CALL_' + sample_id

        if destination_field not in specific_columns:
            specific_columns.append(destination_field)

        if destination_field not in existing_columns:
            # The current sample does not exist yet in the avsc file, so we need to add it
            call_schema = {
                "name": destination_field,
                "type": ["string", "null"],
                "doc": "Column for a specific sample"
            }
            avro_schema['fields'].append(call_schema)
            existing_columns.append(destination_field)
            modified_avro_schema = True

    if modified_avro_schema is True:
        with open('/tmp/cgs_' + tmp_filename + '.avsc', 'w') as content_file:
            content_file.write(json.dumps(avro_schema))

        request.fs.create('/user/cgs/cgs_' + tmp_filename + '.avsc',
                          overwrite=True,
                          data=json.dumps(avro_schema))

    # We convert the flat json to hbase (mostly a key mapping)
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_' + tmp_filename + '.json',
                               output_file='/tmp/cgs_' + tmp_filename +
                               '.hbase',
                               input_type='jsonflat',
                               output_type='hbase')
    status = convert.convertFlatJsonToHbase()
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Conversion from flatjson to hbase... ' +
               str(time.time() - st) + '\n')
    ftmp.close()

    # We put the hbase file on hdfs
    hbase_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.hbase')
    buffer = min(hbase_length, 1024 * 1024 * 50)
    st = time.time()
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.hbase',
                          overwrite=True,
                          data='')
        for offset in xrange(0, hbase_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing hbase to hdfs (/user/cgs/' + tmp_filename +
                       '.hbase)... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.hbase',
                              data=cont)

    # We convert the hbase file to an avro file
    st = time.time()
    convert = formatConverters(
        input_file='/tmp/cgs_' + tmp_filename + '.hbase',
        output_file='/tmp/cgs_' + tmp_filename + '.avro',
        input_type='jsonflat',
        output_type='avro')
    status = convert.convertHbaseToAvro(avscFile='/tmp/cgs_' + tmp_filename +
                                        '.avsc')

    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Conversion from hbase to avro... ' + str(time.time() - st) +
               '\n')
    ftmp.close()

    # We put the avro file on hdfs
    st = time.time()
    avro_length = os.path.getsize('/tmp/cgs_' + tmp_filename + '.avro')
    buffer = min(avro_length, 1024 * 1024 * 50)
    with open('/tmp/cgs_' + tmp_filename + '.avro', 'r') as content_file:
        request.fs.create('/user/cgs/' + tmp_filename + '.avro',
                          overwrite=True,
                          data='')
        request.fs.create('/user/cgs/' + tmp_filename + '.archive.avro',
                          overwrite=True,
                          data='')
        for offset in xrange(0, avro_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt', 'a')
            ftmp.write('Pushing avro to hdfs (/user/cgs/' + tmp_filename +
                       '.avro)... ' + str(time.time() - st) + '\n')
            ftmp.close()
            request.fs.append('/user/cgs/' + tmp_filename + '.avro', data=cont)
            request.fs.append('/user/cgs/' + tmp_filename + '.archive.avro',
                              data=cont)

    tmpf = open('/tmp/cgs_superhello.txt', 'a')
    # 0th: we get the columns from the parquet table to detect missing columns for the new calls we just created
    query = hql_query("show column stats variants")
    handle = db.execute_and_wait(query, timeout_sec=30.0)
    data = db.fetch(handle, rows=1000000)
    rows = list(data.rows())
    columns_for_new_calls = []
    existing_calls_columns = []
    for row in rows:
        current_column = row[0]
        if current_column.startswith('i_call_'):
            existing_calls_columns.append(str(current_column).lower())

    for current_sample in ids_of_samples:
        destination_field = str('I_CALL_' + current_sample).lower()
        if destination_field not in existing_calls_columns and destination_field not in columns_for_new_calls:
            columns_for_new_calls.append(destination_field)
    tmpf.write("Existing calls: " + json.dumps(existing_calls_columns) +
               "\r\n")
    tmpf.write("New calls: " + json.dumps(columns_for_new_calls))
    tmpf.close()

    # 1st: we create a temporary hive table with avro storage
    st = time.time()
    result, variants_table = database_create_variants(
        request, temporary=True, specific_columns=specific_columns)

    tmpf = open('/tmp/cgs_superhello.txt', 'a')
    # 2nd: we import the previously created avro file inside the temporary avro table
    query_server = get_query_server_config(name='hive')
    hive_db = dbms.get(request.user, query_server=query_server)
    variants_columns = []
    for variants_column in variants_table:
        variants_columns.append(str(variants_column).split(' ').pop(0))

    query = hql_query("load data inpath '/user/cgs/" + tmp_filename +
                      ".avro' into table variants_tmp_" +
                      request.user.username + ";")
    handle = hive_db.execute_and_wait(query, timeout_sec=3600.0)

    # Necessary for Impala to detect a Hive table
    query = hql_query("invalidate metadata;")
    handle = db.execute_and_wait(query, timeout_sec=30.0)

    # 3rd: if needed, we modify the global parquet table to add the new columns for each call
    if len(columns_for_new_calls) > 0:
        query = hql_query("alter table variants add columns (" +
                          ' STRING, '.join(columns_for_new_calls) + " STRING)")
        handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 4th: we import the data from the temporary avro table to the global parquet table
    query = hql_query("insert into table variants (" +
                      ','.join(variants_columns) + ") select " +
                      ','.join(variants_columns) + " from variants_tmp_" +
                      request.user.username + " ;")
    handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 5th: we delete the temporary table
    #query = hql_query("drop table variants_tmp_"+request.user.username+";")
    #handle = hive_db.execute_and_wait(query, timeout_sec=30.0)
    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write(
        'Creation of temporary table, import to global variants table (parquet): '
        + str(time.time() - st) + '\n')
    ftmp.close()

    st = time.time()
    # We put the data in HBase. For now we do it simply, but we should use the bulk upload (TODO)
    with open('/tmp/cgs_' + tmp_filename + '.hbase', 'r') as content_file:
        for line in content_file:
            # We create the json content
            hbase_data = json.loads(line)
            rowkey = hbase_data['rowkey']
            del hbase_data['rowkey']
            del hbase_data['pk']

            # We can save the new variant
            hbaseApi.putRow(cluster=currentCluster['name'],
                            tableName='variants',
                            row=rowkey,
                            data=hbase_data)

    ftmp = open('/tmp/cgs_superhello.txt', 'a')
    ftmp.write('Import into HBase: ' + str(time.time() - st) + '\n')
    ftmp.close()

    # We delete the temporary files previously created on this node
    os.remove('/tmp/cgs_' + tmp_filename + '.avsc')
    os.remove('/tmp/cgs_' + tmp_filename + '.vcf')
    os.remove('/tmp/cgs_' + tmp_filename + '.json')
    os.remove('/tmp/cgs_' + tmp_filename + '.avro')
    os.remove('/tmp/cgs_' + tmp_filename + '.hbase')

    return True
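
The create-then-append loop used above for the .json, .hbase and .avro files is the same pattern repeated three times; a small helper could factor it out. This is only a sketch, assuming request.fs exposes exactly the create() and append() calls used in the example:

    def push_to_hdfs(fs, local_path, hdfs_path, chunk_size=1024 * 1024 * 50):
        # Sketch: create an empty target on HDFS, then append the local file
        # in fixed-size chunks, mirroring the loops in import_of_vcf above.
        fs.create(hdfs_path, overwrite=True, data='')
        with open(local_path, 'r') as content_file:
            while True:
                chunk = content_file.read(chunk_size)
                if not chunk:
                    break
                fs.append(hdfs_path, data=chunk)

    # Example usage (hypothetical paths):
    # push_to_hdfs(request.fs, '/tmp/cgs_' + tmp_filename + '.json',
    #              '/user/cgs/' + tmp_filename + '.json')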
Example no. 4
def import_of_vcf(request, filename, length):
    # This function is in charge of importing a VCF (converting the VCF to Avro, etc.); as it is not fast, it should be
    # called asynchronously

    # Connection to the db
    try:
        query_server = get_query_server_config(name='impala')
        db = dbms.get(request.user, query_server=query_server)
    except Exception:
        return False

    hbaseApi = HbaseApi(user=request.user)
    currentCluster = hbaseApi.getClusters().pop()

    # To analyze the content of the vcf, we need to get it from the hdfs to this node
    buffer = min(length,1024*1024*512)
    tmp_filename = 'import_'+request.user.username+'_'+str(int(time.time()))
    f = open('/tmp/cgs_'+tmp_filename+'.vcf',mode='w')
    for offset in xrange(0, length, buffer):
        tmp_vcf = request.fs.read(path='/user/'+request.user.username+'/'+filename, offset=offset, length=buffer, bufsize=buffer)
        f.write(tmp_vcf)
    f.close()

    # Now we try to analyze the vcf a little bit more with the correct tool
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_'+tmp_filename+'.vcf',output_file='/tmp/cgs_'+tmp_filename+'.json',input_type='vcf',output_type='jsonflat')
    status, columns, ids_of_samples, rowkeys = convert.convertVcfToFlatJson(request=request, initial_file=filename)
    f = open('/tmp/cgs_superhello.txt','w')
    f.write('EXECUTION TIME to flat json:'+str(time.time()-st)+'\n')
    f.close()

    # We put the output on hdfs
    json_size = os.path.getsize('/tmp/cgs_'+tmp_filename+'.json')
    buffer = min(json_size, 1024*1024*50)
    st = time.time()
    with open('/tmp/cgs_'+tmp_filename+'.json', 'r') as content_file:
        request.fs.create('/user/cgs/'+tmp_filename+'.json', overwrite=True, data='')
        for offset in xrange(0, json_size, buffer):
            ftmp = open('/tmp/cgs_superhello.txt','a')
            ftmp.write('Pushing flatjson to hdfs (/user/cgs/'+tmp_filename+')... '+str(time.time()-st)+'\n')
            ftmp.close()
            request.fs.append('/user/cgs/'+tmp_filename+'.json', data=content_file.read(buffer))

    # We modify the avsc file with the new calls if needed (in fact, we get the basic schema
    # and we will add data from the existing db)
    avro_schema = {}
    with open('myapps/variants/variants.avsc','r') as content_file:
        avro_schema = json.loads(content_file.read())
        with open('/tmp/cgs_'+tmp_filename+'.avsc','w') as f:
            f.write(json.dumps(avro_schema))

    existing_columns = []
    for field in avro_schema['fields']:
        existing_columns.append(field['name'])
    modified_avro_schema = False
    specific_columns = [] # Used below for the import in impala/hive
    for sample_id in ids_of_samples:
        destination_field = 'I_CALL_'+sample_id

        if destination_field not in specific_columns:
            specific_columns.append(destination_field)

        if destination_field not in existing_columns:
            # The current sample does not exist yet in the avsc file, so we need to add it
            call_schema = {"name":destination_field,"type":["string","null"],"doc":"Column for a specific sample"}
            avro_schema['fields'].append(call_schema)
            existing_columns.append(destination_field)
            modified_avro_schema = True

    if modified_avro_schema is True:
        with open('/tmp/cgs_'+tmp_filename+'.avsc','w') as content_file:
            content_file.write(json.dumps(avro_schema))

        request.fs.create('/user/cgs/cgs_'+tmp_filename+'.avsc', overwrite=True, data=json.dumps(avro_schema))

    # We convert the flat json to hbase (mostly a key mapping)
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_'+tmp_filename+'.json',output_file='/tmp/cgs_'+tmp_filename+'.hbase',input_type='jsonflat',output_type='hbase')
    status = convert.convertFlatJsonToHbase()
    ftmp = open('/tmp/cgs_superhello.txt','a')
    ftmp.write('Conversion from flatjson to hbase... '+str(time.time()-st)+'\n')
    ftmp.close()

    # We put the hbase file on hdfs
    hbase_length = os.path.getsize('/tmp/cgs_'+tmp_filename+'.hbase')
    buffer = min(hbase_length,1024*1024*50)
    st = time.time()
    with open('/tmp/cgs_'+tmp_filename+'.hbase', 'r') as content_file:
        request.fs.create('/user/cgs/'+tmp_filename+'.hbase', overwrite=True, data='')
        for offset in xrange(0, hbase_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt','a')
            ftmp.write('Pushing hbase to hdfs (/user/cgs/'+tmp_filename+'.hbase)... '+str(time.time()-st)+'\n')
            ftmp.close()
            request.fs.append('/user/cgs/'+tmp_filename+'.hbase', data=cont)

    # We convert the hbase file to an avro file
    st = time.time()
    convert = formatConverters(input_file='/tmp/cgs_'+tmp_filename+'.hbase',output_file='/tmp/cgs_'+tmp_filename+'.avro',input_type='jsonflat',output_type='avro')
    status = convert.convertHbaseToAvro(avscFile='/tmp/cgs_'+tmp_filename+'.avsc')

    ftmp = open('/tmp/cgs_superhello.txt','a')
    ftmp.write('Conversion from hbase to avro... '+str(time.time()-st)+'\n')
    ftmp.close()

    # We put the avro file on hdfs
    st = time.time()
    avro_length = os.path.getsize('/tmp/cgs_'+tmp_filename+'.avro')
    buffer = min(avro_length, 1024*1024*50)
    with open('/tmp/cgs_'+tmp_filename+'.avro', 'r') as content_file:
        request.fs.create('/user/cgs/'+tmp_filename+'.avro', overwrite=True, data='')
        request.fs.create('/user/cgs/'+tmp_filename+'.archive.avro', overwrite=True, data='')
        for offset in xrange(0, avro_length, buffer):
            cont = content_file.read(buffer)
            ftmp = open('/tmp/cgs_superhello.txt','a')
            ftmp.write('Pushing avro to hdfs (/user/cgs/'+tmp_filename+'.avro)... '+str(time.time()-st)+'\n')
            ftmp.close()
            request.fs.append('/user/cgs/'+tmp_filename+'.avro', data=cont)
            request.fs.append('/user/cgs/'+tmp_filename+'.archive.avro', data=cont)

    tmpf = open('/tmp/cgs_superhello.txt','a')
    # 0th: we get the columns from the parquet table to detect missing columns for the new calls we just created
    query = hql_query("show column stats variants")
    handle = db.execute_and_wait(query, timeout_sec=30.0)
    data = db.fetch(handle, rows=1000000)
    rows = list(data.rows())
    columns_for_new_calls = []
    existing_calls_columns = []
    for row in rows:
        current_column = row[0]
        if current_column.startswith('i_call_'):
            existing_calls_columns.append(str(current_column).lower())

    for current_sample in ids_of_samples:
        destination_field = str('I_CALL_'+current_sample).lower()
        if destination_field not in existing_calls_columns and destination_field not in columns_for_new_calls:
            columns_for_new_calls.append(destination_field)
    tmpf.write("Existing calls: "+json.dumps(existing_calls_columns)+"\r\n")
    tmpf.write("New calls: "+json.dumps(columns_for_new_calls))
    tmpf.close()

    # 1st: we create a temporary hive table with avro storage
    st = time.time()
    result, variants_table = database_create_variants(request, temporary=True, specific_columns=specific_columns)

    tmpf = open('/tmp/cgs_superhello.txt','a')
    # 2nd: we import the previously created avro file inside the temporary avro table
    query_server = get_query_server_config(name='hive')
    hive_db = dbms.get(request.user, query_server=query_server)
    variants_columns = []
    for variants_column in variants_table:
        variants_columns.append(str(variants_column).split(' ').pop(0))

    query = hql_query("load data inpath '/user/cgs/"+tmp_filename+".avro' into table variants_tmp_"+request.user.username+";")
    handle = hive_db.execute_and_wait(query, timeout_sec=3600.0)

    # Necessary for Impala to detect a Hive table
    query = hql_query("invalidate metadata;")
    handle = db.execute_and_wait(query, timeout_sec=30.0)

    # 3rd: if needed, we modify the global parquet table to add the new columns for each call
    if len(columns_for_new_calls) > 0:
        query = hql_query("alter table variants add columns ("+' STRING, '.join(columns_for_new_calls)+" STRING)")
        handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 4th: we import the data from the temporary avro table to the global parquet table
    query = hql_query("insert into table variants ("+','.join(variants_columns)+") select "+','.join(variants_columns)+" from variants_tmp_"+request.user.username+" ;")
    handle = db.execute_and_wait(query, timeout_sec=3600.0)

    # 5th: we delete the temporary table
    #query = hql_query("drop table variants_tmp_"+request.user.username+";")
    #handle = hive_db.execute_and_wait(query, timeout_sec=30.0)
    ftmp = open('/tmp/cgs_superhello.txt','a')
    ftmp.write('Creation of temporary table, import to global variants table (parquet): '+str(time.time()-st)+'\n')
    ftmp.close()

    st = time.time()
    # We put the data in HBase. For now we do it simply, but we should use the bulk upload (TODO)
    with open('/tmp/cgs_'+tmp_filename+'.hbase', 'r') as content_file:
        for line in content_file:
            # We create the json content
            hbase_data = json.loads(line)
            rowkey = hbase_data['rowkey']
            del hbase_data['rowkey']
            del hbase_data['pk']

            # We can save the new variant
            hbaseApi.putRow(cluster=currentCluster['name'], tableName='variants', row=rowkey, data=hbase_data)

    ftmp = open('/tmp/cgs_superhello.txt','a')
    ftmp.write('Import into HBase: '+str(time.time()-st)+'\n')
    ftmp.close()

    # We delete the temporary files previously created on this node
    os.remove('/tmp/cgs_'+tmp_filename+'.avsc')
    os.remove('/tmp/cgs_'+tmp_filename+'.vcf')
    os.remove('/tmp/cgs_'+tmp_filename+'.json')
    os.remove('/tmp/cgs_'+tmp_filename+'.avro')
    os.remove('/tmp/cgs_'+tmp_filename+'.hbase')

    return True
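
As a worked example of the HQL that steps 3 and 4 build: with new per-sample call columns, the join() calls produce one ALTER TABLE ... ADD COLUMNS statement typing every new column as STRING, followed by an INSERT ... SELECT that copies the shared columns from the temporary Avro table into the Parquet table. The sample ids and the username below are illustrative only:

    # Worked example of the generated HQL; sample ids and username are made up.
    columns_for_new_calls = ['i_call_sample1', 'i_call_sample2']
    alter_hql = ("alter table variants add columns (" +
                 ' STRING, '.join(columns_for_new_calls) + " STRING)")
    # -> alter table variants add columns (i_call_sample1 STRING, i_call_sample2 STRING)

    variants_columns = ['pk', 'i_call_sample1', 'i_call_sample2']
    insert_hql = ("insert into table variants (" + ','.join(variants_columns) +
                  ") select " + ','.join(variants_columns) +
                  " from variants_tmp_" + 'someuser' + " ;")
    # -> insert into table variants (pk,i_call_sample1,i_call_sample2)
    #    select pk,i_call_sample1,i_call_sample2 from variants_tmp_someuser ;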