Example #1
def vcf_import_worker(queue, file_id, samples):
    while True:
        query = queue.get()
        if query is None:
            break

        Model.execute(query)
        queue.task_done()
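
For context, a minimal sketch of how this worker is typically wired up, mirroring the thread setup in the import_data example further below (Queue, Thread, VCF_IMPORT_MAX_THREAD, file_id and samples are assumed to come from the surrounding module):

from queue import Queue
from threading import Thread

VCF_IMPORT_MAX_THREAD = 4  # assumed module-level constant
file_id, samples = 1, {}   # placeholders for illustration

queue = Queue(maxsize=0)
workers = []
for i in range(VCF_IMPORT_MAX_THREAD):
    t = Thread(target=vcf_import_worker, args=(queue, file_id, samples), daemon=True)
    t.start()
    workers.append(t)

# Producer side: enqueue SQL transactions, wait for completion,
# then send one None per worker so each breaks out of its loop.
queue.put("UPDATE sample SET status='loading';")
queue.join()
for i in range(VCF_IMPORT_MAX_THREAD):
    queue.put(None)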
Example #2
    def create_annotation_db(reference_id, reference_name, table_name,
                             vcf_annotation_metadata):
        """
            Create an annotation database according to information retrieved from the VCF file with the prepare_vcf_parsing method
        """
        # Create annotation table
        pk = 'transcript_id character varying(100), ' if vcf_annotation_metadata[
            'db_type'] == 'transcript' else ''
        pk2 = ',transcript_id' if vcf_annotation_metadata[
            'db_type'] == 'transcript' else ''
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, " + pk + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id" + pk2 + "));"
        query = ""
        db_map = {}
        fields = []
        for col in vcf_annotation_metadata['columns']:
            col_name = normalise_annotation_name(col)
            fields.append("{} text".format(col_name))
            db_map[col_name] = {
                'name': col_name,
                'type': 'string',
                'name_ui': col
            }  # By default, create a table with only text fields. Types can be changed by the user via a dedicated UI
        query += pattern.format(table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(
            table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(
            table_name)
        if vcf_annotation_metadata['db_type'] == 'transcript':
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(
                table_name)

        # Register annotation
        db_uid, pk_uid = Model.execute(
            "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
                table_name,
                normalise_annotation_name(
                    vcf_annotation_metadata['db_pk_field']))).first()
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format(  # We removed this condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/ in the jointure as this condition is already done by a previous query when updating working table with annotations
            db_uid, reference_id, table_name,
            vcf_annotation_metadata['version'],
            vcf_annotation_metadata['name'],
            vcf_annotation_metadata['description'], 30,
            vcf_annotation_metadata['db_type'], pk_uid)

        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES "
        for idx, f in enumerate(vcf_annotation_metadata['columns']):
            query += "('{0}', {1}, '{2}', '{3}', 'string'),".format(
                db_uid, idx, normalise_annotation_name(f), f)
        Model.execute(query[:-1])
        Model.execute(
            "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;"
        )
        return db_uid, db_map
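
For reference, the vcf_annotation_metadata dict consumed above is built by the prepare_vcf_parsing step (see the last example). A hypothetical instance, shaped after the keys this function reads; the values are illustrative only:

vcf_annotation_metadata = {
    'name': 'VEP',
    'version': '87',
    'description': 'Consequence annotations from Ensembl VEP.',
    'db_type': 'transcript',   # 'transcript' adds the transcript_id column and index
    'db_pk_field': 'Feature',  # VCF annotation column used as transcript primary key
    'columns': ['Allele', 'Consequence', 'Feature'],
}
db_uid, db_map = create_annotation_db(2, 'hg19', 'vep_87_hg19', vcf_annotation_metadata)
# db_map maps normalised column names to their DB definition, e.g.
# {'allele': {'name': 'allele', 'type': 'string', 'name_ui': 'Allele'}, ...}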
Example #3
 def delete(self, project_id):
     """ 
         Delete the project
         All its analyses are put into the trash project (id = 0)
     """
     project = Model.Project.from_id(project_id)
     if not project:
         raise RegovarException(code="E102001", arg=[project_id])
     sql = "UPDATE analysis SET project_id=0 WHERE project_id={0}; DELETE FROM project WHERE id={0}".format(
         project.id)
     result = project.to_json()
     Model.execute(sql)
     return result
Example #4
    def init(self, headers, reference_id):
        """
            Check VCF headers and return True if SnpEff data can be imported, False otherwise.
            When SnpEff data are present, also initialise the importer's internal data.
        """
        result = False
        
        if 'SnpEffVersion' in headers.keys():
            vcf_flag = None
            if 'EFF' in headers['INFO'].keys():
                vcf_flag = 'EFF'
                err("TODO: Old SnpEff annotation (EFF) importation is not implemented")
                
            elif 'ANN' in headers['INFO'].keys():
                vcf_flag = 'ANN'
                reference_name = Model.execute("SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
                data = headers['INFO'][vcf_flag]['description'].split('Functional annotations:')
                self.name = "SnpEff"
                self.reference_id = reference_id
                self.description = "SnpEff variant annotation and effect prediction tool."
                self.columns = [self.normalise_annotation_name(c).title() for c in data[1].strip().strip("'").split('|')]
                self.version = headers['SnpEffVersion'][0].strip().strip('"').split(' ')[0]
                self.table_name = self.normalise_annotation_name('{}_{}_{}'.format('SnpEff', self.version, reference_name))
                self.vcf_flag = vcf_flag
                self.columns_definitions = SnpEffImporter.columns_definitions
                result = 'Feature_Id' in self.columns
                
        if result:
            self.check_annotation_table()

        print("SnpEff init : ", result)
        return result
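
To illustrate, a hypothetical headers dict that this init would accept, shaped after what the prepare_vcf_parsing step builds (a sketch rather than a standalone test, since Model.execute needs the application's database):

headers = {
    'SnpEffVersion': ['"4.3t (build 2017-11-24)"'],
    'INFO': {
        'ANN': {
            'type': 'String',
            'description': "Functional annotations: 'Allele | Annotation | Feature_Id'",
        },
    },
}
importer = SnpEffImporter()
if importer.init(headers, reference_id=2):
    # importer.version == '4.3t'
    # importer.columns == ['Allele', 'Annotation', 'Feature_Id']
    print(importer.table_name)  # e.g. 'snpeff_4_3t_hg19' for a 'hg19' reference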
Example #5
    def delete(self, project_id, author_id=None):
        """ 
            Delete the project
            All its analyses are put into the trash project (id = 0)
        """
        from core.core import core
        project = Model.Project.from_id(project_id)
        if not project:
            raise RegovarException(code="E102001", arg=[project_id])
        sql = "UPDATE analysis SET project_id=0 WHERE project_id={0}; DELETE FROM project WHERE id={0}".format(
            project.id)
        result = project.to_json()
        Model.execute(sql)

        core.events.log(author_id, "info", {"project_id": project.id},
                        "Project moved to trash: {}.".format(project.name))
        return result
Example #6
    def create_annotation_db(reference_id, reference_name, table_name, vcf_annotation_metadata):
        """
            Create an annotation database according to information retrieved from the VCF file with the prepare_vcf_parsing method
        """
        # Create annotation table
        pk = 'transcript_id character varying(100), ' if vcf_annotation_metadata['db_type'] == 'transcript' else ''
        pk2 = ',transcript_id' if vcf_annotation_metadata['db_type'] == 'transcript' else ''
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, " + pk + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id" + pk2 +"));"
        query   = ""
        db_map = {}
        fields = []
        for col in vcf_annotation_metadata['columns']:
            col_name = normalise_annotation_name(col)
            fields.append("{} text".format(col_name))
            db_map[col_name] = { 'name' : col_name, 'type' : 'string', 'name_ui' : col }  # By default, create a table with only text fields. Types can be changed by the user via a dedicated UI
        query += pattern.format(table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(table_name)
        if vcf_annotation_metadata['db_type'] == 'transcript':
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(table_name)

        # Register annotation
        db_uid, pk_uid = Model.execute("SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(table_name, normalise_annotation_name(vcf_annotation_metadata['db_pk_field']))).first()
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format( # We removed this condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/ in the jointure as this condition is already done by a previous query when updating working table with annotations
            db_uid, 
            reference_id, 
            table_name, 
            vcf_annotation_metadata['version'], 
            vcf_annotation_metadata['name'], 
            vcf_annotation_metadata['description'], 
            30, 
            vcf_annotation_metadata['db_type'],
            pk_uid)  

        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES "
        for idx, f in enumerate(vcf_annotation_metadata['columns']):
            query += "('{0}', {1}, '{2}', '{3}', 'string'),".format(db_uid, idx, normalise_annotation_name(f), f)
        Model.execute(query[:-1])
        Model.execute("UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;")
        return db_uid, db_map
Example #7
    def init(self, headers, reference_id):
        """
            Check VCF headers and return True if SnpEff data can be imported, False otherwise.
            When SnpEff data are present, also initialise the importer's internal data.
        """
        result = False

        if 'SnpEffVersion' in headers.keys():
            vcf_flag = None
            if 'EFF' in headers['INFO'].keys():
                vcf_flag = 'EFF'
                err("TODO: Old SnpEff annotation (EFF) importation is not implemented"
                    )

            elif 'ANN' in headers['INFO'].keys():
                vcf_flag = 'ANN'
                reference_name = Model.execute(
                    "SELECT table_suffix FROM reference WHERE id={}".format(
                        reference_id)).first()[0]
                data = headers['INFO'][vcf_flag]['description'].split(
                    'Functional annotations:')
                self.name = "SnpEff"
                self.reference_id = reference_id
                self.description = "SnpEff variant annotation and effect prediction tool."
                self.columns = [
                    self.normalise_annotation_name(c).title()
                    for c in data[1].strip().strip("'").split('|')
                ]
                self.version = headers['SnpEffVersion'][0].strip().strip(
                    '"').split(' ')[0]
                self.table_name = self.normalise_annotation_name(
                    '{}_{}_{}'.format('SnpEff', self.version, reference_name))
                self.vcf_flag = vcf_flag
                self.columns_definitions = SnpEffImporter.columns_definitions
                result = 'Feature_Id' in self.columns

        if result:
            self.check_annotation_table()

        print("SnpEff init : ", result)
        return result
Example #8
    def init(self, headers, reference_id):
        """
            Check VCF headers and return True if VEP data can be imported, False otherwise.
            When VEP data are present, also initialise the importer's internal data.
        """
        result = False

        if 'VEP' in headers.keys():
            vcf_flag = None
            if 'CSQ' in headers['INFO'].keys():
                vcf_flag = 'CSQ'
            elif 'ANN' in headers['INFO'].keys():
                vcf_flag = 'ANN'

            if vcf_flag:
                reference_name = Model.execute(
                    "SELECT table_suffix FROM reference WHERE id={}".format(
                        reference_id)).first()[0]

                data = headers['INFO'][vcf_flag]['description'].split(
                    'Format:')
                self.name = "VEP"
                self.reference_id = reference_id
                self.description = data[0].strip()
                self.columns = [
                    self.normalise_annotation_name(c).title()
                    for c in data[1].strip().split('|')
                ]
                self.version = headers['VEP'][0].split(' ')[0]
                self.table_name = self.normalise_annotation_name(
                    '{}_{}_{}'.format('VEP', self.version, reference_name))
                self.vcf_flag = vcf_flag
                self.columns_definitions = VepImporter.columns_definitions
                result = 'Feature' in self.columns

                if result:
                    self.check_annotation_table()

        print("VEP init : ", result)
        return result
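
Likewise for VEP, a hypothetical headers fragment this init would recognise (again a sketch, since the reference name is fetched from the database):

headers = {
    'VEP': ['87'],  # the version is taken as the first space-separated token
    'INFO': {
        'CSQ': {
            'type': 'String',
            'description': 'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|Feature',
        },
    },
}
# init() would set version='87', vcf_flag='CSQ',
# columns=['Allele', 'Consequence', 'Feature'], and return True since 'Feature' is present.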
Example #9
def prepare_annotation_db(reference_id, vcf_annotation_metadata):
    """
        Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
    """

    reference  = Model.execute("SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
    table_name = normalise_annotation_name('{}_{}_{}'.format(vcf_annotation_metadata['name'], vcf_annotation_metadata['version'], reference))
    
    # Get database schema (if available)
    table_cols = {}
    db_uid     = Model.execute("SELECT uid FROM annotation_database WHERE name='{}'".format(table_name)).first()

    if db_uid is None:
        # No table in db for this annotation: create a new table
        db_uid, table_cols = create_annotation_db(reference_id, reference, table_name, vcf_annotation_metadata)
    else:
        db_uid = db_uid[0]
        # Table already exists : retrieve columns already defined
        for col in Model.execute("SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'".format(db_uid)):
            table_cols[col.name] = {'name': col.name, 'type': col.type, 'name_ui': col.name_ui}
    # Get diff between columns in vcf and columns in DB, and update DB schema
    diff = []
    for col in vcf_annotation_metadata['columns']:
        if normalise_annotation_name(col) not in table_cols.keys():
            diff.append(col)
    if len(diff) > 0:
        offset = len(vcf_annotation_metadata['columns'])
        query = ""
        for idx, col in enumerate(diff):
            name = normalise_annotation_name(col)
            query += "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(table_name, name, db_uid, offset + idx, col)
            table_cols[name] = {'name': name, 'type': 'string', 'name_ui': col}

        # execute query
        Model.execute(query)
    # Update vcf_annotation_metadata with database mapping
    db_pk_field_uid = Model.execute("SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".format(db_uid)).first().db_pk_field_uid
    vcf_annotation_metadata.update({'table': table_name, 'db_uid': db_uid, 'db_pk_field_uid': db_pk_field_uid})
    vcf_annotation_metadata['db_map'] = {}
    for col in vcf_annotation_metadata['columns']:
        vcf_annotation_metadata['db_map'][col] = table_cols[normalise_annotation_name(col)]
    return vcf_annotation_metadata
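
On success the metadata dict comes back enriched with the database mapping. A hypothetical result, assuming the 'VEP' metadata sketched earlier:

meta = prepare_annotation_db(2, vcf_annotation_metadata)
# In addition to the original keys, meta now contains:
#   'table'           -> e.g. 'vep_87_hg19'
#   'db_uid'          -> uid of the annotation_database row
#   'db_pk_field_uid' -> uid of the declared primary-key field
#   'db_map'          -> {'Allele': {'name': 'allele', 'type': 'string', 'name_ui': 'Allele'}, ...}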
Example #10
    def import_delegate(self, file_id, vcf_reader, reference_id, db_ref_suffix,
                        vcf_metadata, samples):
        """
            This delegate will do the "real" import.
            It will be called by the "import_data" method in a new thread in order not to block the main thread.
        """
        from core.core import core
        # parsing vcf file
        records_count = vcf_metadata['count']
        records_current = 0
        vcf_line = vcf_metadata['header_count']
        table = "variant" + db_ref_suffix

        sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}])  WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
        sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, vcf_line, bin, chr, pos, ref, alt, genotype, depth, depth_alt, quality, filter) SELECT {0}, id, {1}, {2}, '{3}', {4}, '{5}', '{6}', {7}, {8}, {9}, {10}, '{11}' FROM variant" + db_ref_suffix + " WHERE bin={2} AND chr={3} AND pos={4} AND ref='{5}' AND alt='{6}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"

        sql_annot_trx = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, regovar_trx_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, regovar_trx_id) DO  NOTHING; "  # TODO : do update on conflict
        sql_annot_var = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, {1}) SELECT id, {3},{4},{5},'{6}','{7}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id) DO  NOTHING;"

        sql_query1 = ""
        sql_query2 = ""
        sql_query3 = ""
        count = 0

        for row in vcf_reader:
            records_current += 1
            vcf_line += 1
            #log("> {} : {}".format(records_current, count))
            #if records_current == 14356:
            #ipdb.set_trace()

            # TODO : update sample's progress indicator

            chrm = normalize_chr(str(row.chrom))

            for allele in row.alleles:
                pos, ref, alt = normalise(row.pos, row.ref, allele)
                bin = getMaxUcscBin(pos, pos + len(ref))

                # get list of samples that have this variant (chr-pos-ref-alt)
                samples_array = []
                for sn in row.samples:
                    sp = row.samples.get(sn)
                    if allele in sp.alleles:
                        samples_array.append(samples[sp.name]["id"])
                if len(samples_array) == 0: continue
                # save variant
                samples_array = ",".join([str(s) for s in samples_array])
                sql_query1 += sql_pattern1.format(table, chrm, pos, ref, alt,
                                                  is_transition(ref, alt), bin,
                                                  samples_array)

                # Register variant/sample associations
                for sn in row.samples:
                    sp = row.samples.get(sn)
                    gt = normalize_gt(sp)
                    filters = escape_value_for_sql(
                        json.dumps(row.filter.keys()))
                    count += 1
                    if allele in sp.alleles:
                        if "AD" in sp.keys():
                            # Get allelic depth if exists (AD field)
                            depth_alt = sp["AD"][sp.alleles.index(allele)]
                        elif "DP4" in sp.keys():
                            if gt == 0:
                                depth_alt = sum(sp["DP4"])
                            else:
                                depth_alt = sp["DP4"][2] + sp["DP4"][
                                    3] if alt != ref else sp["DP4"][0] + sp[
                                        "DP4"][1]
                        else:
                            depth_alt = "NULL"

                        sql_query2 += sql_pattern2.format(
                            samples[sn]["id"],
                            vcf_line, bin, chrm, pos, ref, alt, gt,
                            get_info(sp, "DP"), depth_alt, row.qual, filters)
                    else:
                        # record that the sample does NOT have this variant
                        sql_query2 += sql_pattern2.format(
                            samples[sn]["id"],
                            vcf_line, bin, chrm, pos, ref, alt, "NULL",
                            get_info(sp, "DP"), "NULL", row.qual, filters)

                # Register variant annotations
                for ann_name, importer in vcf_metadata["annotations"].items():
                    if importer:
                        importer_query, importer_count = importer.import_annotations(
                            sql_annot_trx, bin, chrm, pos, ref, alt, row.info)
                        sql_query3 += importer_query
                        count += importer_count

            # split big requests to avoid SQL out-of-memory transactions or freezing the server for too long
            if count >= 5000:
                progress = records_current / records_count
                count = 0
                transaction = sql_query1 + sql_query2 + sql_query3
                log("VCF import : line {} (chrm {})".format(
                    records_current, chrm))
                log("VCF import : Execute sync query {}/{} ({}%)".format(
                    records_current, records_count, round(progress * 100, 2)))

                # update sample's progress indicator
                # note : as we are updating a lot of data in the database with several async threads,
                #        we update data with "manual queries" to avoid conflicts with the session
                sps = []
                sql = "UPDATE sample SET loading_progress={} WHERE id IN ({})".format(
                    progress,
                    ",".join([str(samples[sid]["id"]) for sid in samples]))
                Model.execute(sql)
                core.notify_all({
                    "action": "import_vcf_processing",
                    "data": {
                        "reference_id":
                        reference_id,
                        "file_id":
                        file_id,
                        "status":
                        "loading",
                        "progress":
                        progress,
                        "samples": [{
                            "id": samples[sname]["id"],
                            "name": sname
                        } for sname in samples]
                    }
                })

                log("VCF import : enqueue query")
                self.queue.put(transaction)
                # Reset query buffers
                sql_query1 = ""
                sql_query2 = ""
                sql_query3 = ""

        # Loop done, execute last pending query
        log("VCF import : Execute last async query")
        transaction = sql_query1 + sql_query2 + sql_query3
        if transaction:
            self.queue.put(transaction)

        # Wait until all queries in the queue have been executed
        log("VCF parsing done. Waiting for async execution of sql queries")

        # block until all tasks are done
        self.queue.join()
        log("No more sql query to proceed")

        # stop vcf_import_thread_workers
        for i in range(VCF_IMPORT_MAX_THREAD):
            self.queue.put(None)
        for t in self.workers:
            t.join()

        # Compute composite variant by sample
        sql_pattern = "UPDATE sample_variant" + db_ref_suffix + " u SET is_composite=TRUE WHERE u.sample_id = {0} AND u.variant_id IN (SELECT DISTINCT UNNEST(sub.vids) as variant_id FROM (SELECT array_agg(v.variant_id) as vids, g.name2 FROM sample_variant" + db_ref_suffix + " v INNER JOIN refgene" + db_ref_suffix + " g ON g.chr=v.chr AND g.trxrange @> v.pos WHERE v.sample_id={0} AND v.genotype=2 or v.genotype=3 GROUP BY name2 HAVING count(*) > 1) AS sub)"
        log("Computing is_composite fields by samples :")
        for sid in samples:
            query = sql_pattern.format(samples[sid]["id"])
            log(" - sample {}".format(samples[sid]["id"]))
            Model.execute(query)
        log("Sample import from VCF Done")
        end = datetime.datetime.now()

        # update sample's progress indicator
        Model.execute(
            "UPDATE sample SET status='ready', loading_progress=1  WHERE id IN ({})"
            .format(",".join([str(samples[sid]["id"]) for sid in samples])))

        core.notify_all({
            "action": "import_vcf_end",
            "data": {
                "reference_id":
                reference_id,
                "file_id":
                file_id,
                "msg":
                "Import done without error.",
                "samples": [{
                    "id": samples[s]["id"],
                    "name": samples[s]["name"]
                } for s in samples.keys()]
            }
        })

        # When import is done, check if analyses are waiting for creation, and start working-table (wt) creation if all samples are ready
        # TODO
        sql = "SELECT DISTINCT(analysis_id) FROM analysis_sample WHERE sample_id IN ({})".format(
            ",".join([str(samples[sid]["id"]) for sid in samples]))
        for row in Model.execute(sql):
            analysis = Model.Analysis.from_id(row.analysis_id, 1)
            if analysis.status == "waiting":
                log("Auto initialisation of the analysis in witing state : {} ({})"
                    .format(analysis.name, analysis.id))
                core.filters.request(analysis.id, analysis.filter,
                                     analysis.fields)
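
getMaxUcscBin is imported from the project's framework and, judging by its use on (pos, pos + len(ref)) intervals, presumably implements the standard UCSC genome binning scheme. A minimal sketch under that assumption:

def getMaxUcscBin(start, end):
    # Standard UCSC binning: return the smallest bin fully containing
    # [start, end), scanning from 128kb bins up to the 512Mb top-level bin.
    start_bin = start >> 17
    end_bin = max(start, end - 1) >> 17
    for offset in [512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0]:
        if start_bin == end_bin:
            return offset + start_bin
        start_bin >>= 3
        end_bin >>= 3
    return 0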
Example #11
def prepare_annotation_db(reference_id, vcf_annotation_metadata):
    """
        Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
    """

    reference = Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(
            reference_id)).first()[0]
    table_name = normalise_annotation_name('{}_{}_{}'.format(
        vcf_annotation_metadata['name'], vcf_annotation_metadata['version'],
        reference))

    # Get database schema (if available)
    table_cols = {}
    db_uid = Model.execute(
        "SELECT uid FROM annotation_database WHERE name='{}'".format(
            table_name)).first()

    if db_uid is None:
        # No table in db for this annotation: create a new table
        db_uid, table_cols = create_annotation_db(reference_id, reference,
                                                  table_name,
                                                  vcf_annotation_metadata)
    else:
        db_uid = db_uid[0]
        # Table already exists : retrieve columns already defined
        for col in Model.execute(
                "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'"
                .format(db_uid)):
            table_cols[col.name] = {
                'name': col.name,
                'type': col.type,
                'name_ui': col.name_ui
            }
    # Get diff between columns in vcf and columns in DB, and update DB schema
    diff = []
    for col in vcf_annotation_metadata['columns']:
        if normalise_annotation_name(col) not in table_cols.keys():
            diff.append(col)
    if len(diff) > 0:
        offset = len(vcf_annotation_metadata['columns'])
        query = ""
        for idx, col in enumerate(diff):
            name = normalise_annotation_name(col)
            query += "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(
                table_name, name, db_uid, offset + idx, col)
            table_cols[name] = {'name': name, 'type': 'string', 'name_ui': col}

        # execute query
        Model.execute(query)
    # Update vcf_annotation_metadata with database mapping
    db_pk_field_uid = Model.execute(
        "SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".
        format(db_uid)).first().db_pk_field_uid
    vcf_annotation_metadata.update({
        'table': table_name,
        'db_uid': db_uid,
        'db_pk_field_uid': db_pk_field_uid
    })
    vcf_annotation_metadata['db_map'] = {}
    for col in vcf_annotation_metadata['columns']:
        vcf_annotation_metadata['db_map'][col] = table_cols[
            normalise_annotation_name(col)]
    return vcf_annotation_metadata
Example #12
    async def import_data(self, file_id, **kargs):
        """
            Import samples, variants and annotations from the provided file.
            This method checks the provided parameters and parses the header of the vcf to get the samples and to compute the number of lines
            that need to be parsed, which allows us to compute a progress indicator. The parsing is done by a delegate called in another thread.
            Returns the list of samples that have been added.
        """
        from core.core import core
        file = Model.File.from_id(file_id)
        filepath = file.path
        reference_id = kargs["reference_id"]
        start_0 = datetime.datetime.now()
        job_in_progress = []

        
        vcf_metadata = prepare_vcf_parsing(reference_id, filepath)
        db_ref_suffix= "_" + Model.execute("SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first().table_suffix

        if vcf_metadata:
            filepath += ".regovar_import" # a tmp file have been created by prepare_vcf_parsing() method to avoid pysam unsupported file format.
            start = datetime.datetime.now()
            
            # Create vcf parser
            vcf_reader = VariantFile(filepath)

            # get samples in the VCF 
            # samples = {i : Model.get_or_create(Model.Session(), Model.Sample, name=i)[0] for i in list((vcf_reader.header.samples))}
            samples = {}
            for i in vcf_reader.header.samples:
                sample = Model.Sample.new()
                sample.name = i
                sample.file_id = file_id
                sample.reference_id = reference_id
                sample.filter_description = {filter[0]:filter[1].description for filter in vcf_reader.header.filters.items()}
                sample.default_dbuid = []
                sample.status = "loading"
                for dbname in vcf_metadata["annotations"].keys():
                    if vcf_metadata["annotations"][dbname]:
                        sample.default_dbuid.append(vcf_metadata["annotations"][dbname].db_uid)
                # TODO : is_mosaic according to the data in the vcf
                sample.save()
                
                # As these samples will be shared with other threads, we remove them from the sql session to avoid errors
                samples.update({i : sample.to_json()})
                
            if len(samples.keys()) == 0:
                war("VCF files without samples cannot be imported in the database.")
                await core.notify_all_co({"action": "import_vcf_error", "data" : {"reference_id": reference_id, "file_id" : file_id, "msg" : "VCF files without samples cannot be imported in the database."}})
                return


            # # tasks queue shared by all thread
            # self.queue = Queue(maxsize=0)
            # # list of worker created to execute multithread tasks
            # self.workers = []
            
            # # init threading workers
            # for i in range(VCF_IMPORT_MAX_THREAD):
            #     t = Thread(target=vcf_import_worker, args=(self.queue, file_id, samples), daemon=True)
            #     t.start()
            #     self.workers.append(t)


            await core.notify_all_co({"action":"import_vcf_start", "data" : {"reference_id": reference_id, "file_id" : file_id, "samples" : [ {"id" : samples[sid]["id"], "name" : samples[sid]["name"]} for sid in samples.keys()]}})
            records_count = vcf_metadata["count"]
            log ("Importing file {0}\n\r\trecords  : {1}\n\r\tsamples  :  ({2}) {3}\n\r\tstart    : {4}".format(filepath, records_count, len(samples.keys()), reprlib.repr([sid for sid in samples.keys()]), start))
            
            run_async(self.import_delegate, file_id, vcf_reader, reference_id, db_ref_suffix, vcf_metadata, samples)
        
            return {"success": True, "samples": samples, "records_count": records_count }
        return {"success": False, "error": "File not supported"}
Example #13
    def import_delegate(self, file_id, vcf_reader, reference_id, db_ref_suffix, vcf_metadata, samples):
        """
            This delegate will do the "real" import.
            It will be called by the "import_data" method in a new thread in order not to block the main thread.
        """
        from core.core import core
        # parsing vcf file
        records_count = vcf_metadata['count']
        records_current = 0
        vcf_line = vcf_metadata['header_count']
        table = "variant" + db_ref_suffix
        
        sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}])  WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
        sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, vcf_line, bin, chr, pos, ref, alt, genotype, depth, depth_alt, quality, filter) SELECT {0}, id, {1}, {2}, '{3}', {4}, '{5}', '{6}', {7}, {8}, {9}, {10}, '{11}' FROM variant" + db_ref_suffix + " WHERE bin={2} AND chr={3} AND pos={4} AND ref='{5}' AND alt='{6}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
        
        sql_annot_trx = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, regovar_trx_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, regovar_trx_id) DO  NOTHING; " # TODO : do update on conflict
        sql_annot_var = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, {1}) SELECT id, {3},{4},{5},'{6}','{7}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id) DO  NOTHING;"

        sql_query1 = ""
        sql_query2 = ""
        sql_query3 = ""
        count = 0
        
        for row in vcf_reader: 
            records_current += 1 
            vcf_line += 1
            #log("> {} : {}".format(records_current, count))
            #if records_current == 14356:
                #ipdb.set_trace()
                    
            # TODO : update sample's progress indicator
            
            
            chrm = normalize_chr(str(row.chrom))
            
            for allele in row.alleles:
                pos, ref, alt = normalise(row.pos, row.ref, allele)
                bin = getMaxUcscBin(pos, pos + len(ref))
                
                # get list of samples that have this variant (chr-pos-ref-alt)
                samples_array = []
                for sn, sp in row.samples.items():
                    if allele in sp.alleles:
                        samples_array.append(samples[sp.name]["id"])
                if len(samples_array) == 0: continue
                # save variant
                samples_array = ",".join([str(s) for s in samples_array])
                sql_query1 += sql_pattern1.format(table, chrm, pos, ref, alt, is_transition(ref, alt), bin, samples_array)
                        
                # Register variant/sample associations
                for sn, sp in row.samples.items():
                    gt = normalize_gt(sp)
                    filters = escape_value_for_sql(json.dumps(row.filter.keys()))
                    count += 1
                    if allele in sp.alleles:
                        if "AD" in sp.keys():
                            # Get allelic depth if exists (AD field)
                            depth_alt = sp["AD"][sp.alleles.index(allele)] 
                        elif "DP4" in sp.keys():
                            if gt == 0:
                                depth_alt = sum(sp["DP4"])
                            else:
                                depth_alt = sp["DP4"][2] + sp["DP4"][3] if alt != ref else sp["DP4"][0] + sp["DP4"][1]
                        else:
                            depth_alt = "NULL"
                        
                        sql_query2 += sql_pattern2.format(samples[sn]["id"], vcf_line, bin, chrm, pos, ref, alt, gt, get_info(sp, "DP"), sqlc(depth_alt), sqlc(row.qual), filters)
                    else:
                        # record that the sample does NOT have this variant
                        sql_query2 += sql_pattern2.format(samples[sn]["id"], vcf_line, bin, chrm, pos, ref, alt, "NULL", get_info(sp, "DP"), "NULL", sqlc(row.qual), filters)
                
                # Register variant annotations
                for ann_name, importer in vcf_metadata["annotations"].items():
                    if importer:
                        importer_query, importer_count = importer.import_annotations(sql_annot_trx, bin, chrm, pos, ref, alt, row.info)
                        sql_query3 += importer_query
                        count += importer_count
                        
            # split big requests to avoid SQL out-of-memory transactions or freezing the server for too long
            if count >= 1000:
                progress = records_current / records_count
                count = 0
                transaction = "BEGIN; " + sql_query1 + sql_query2 + sql_query3 + "COMMIT; "
                log("VCF import : line {} (chrm {})".format(records_current, chrm))
                log("VCF import : Execute sync query {}/{} ({}%)".format(records_current, records_count, round(progress * 100, 2)))
                
                    
                # update sample's progress indicator
                # note : as we are updating a lot of data in the database with several async threads,
                #        we update data with "manual queries" to avoid conflicts with the session
                sps = []
                sql = "UPDATE sample SET loading_progress={} WHERE id IN ({})".format(progress, ",".join([str(samples[sid]["id"]) for sid in samples]))
                Model.execute(sql)
                core.notify_all({"action": "import_vcf_processing", "data" : {"reference_id": reference_id, "file_id" : file_id, "status" : "loading", "progress": progress, "samples": [ {"id" : samples[sname]["id"], "name" : sname} for sname in samples]}})
                
                #log("VCF import : enqueue query")
                #self.queue.put(transaction)
                log("VCF import : execute query")
                Model.execute(transaction)
                # Reset query buffers
                sql_query1 = ""
                sql_query2 = ""
                sql_query3 = ""

        # # Loop done, execute last pending query 
        # log("VCF import : Execute last async query")
        # transaction = sql_query1 + sql_query2 + sql_query3
        # if transaction:
        #     self.queue.put(transaction)


        # # Waiting that all query in the queue was executed
        # log("VCF parsing done. Waiting for async execution of sql queries")
        
        # # block until all tasks are done
        # self.queue.join()
        # log("No more sql query to proceed")
        
        # # stop vcf_import_thread_workers
        # for i in range(VCF_IMPORT_MAX_THREAD):
        #     self.queue.put(None)
        # for t in self.workers:
        #     t.join()

        # Compute composite variant by sample
        sql_pattern = "UPDATE sample_variant" + db_ref_suffix + " u SET is_composite=TRUE WHERE u.sample_id = {0} AND u.variant_id IN (SELECT DISTINCT UNNEST(sub.vids) as variant_id FROM (SELECT array_agg(v.variant_id) as vids, g.name2 FROM sample_variant" + db_ref_suffix + " v INNER JOIN refgene" + db_ref_suffix + " g ON g.chr=v.chr AND g.trxrange @> v.pos WHERE v.sample_id={0} AND v.genotype=2 or v.genotype=3 GROUP BY name2 HAVING count(*) > 1) AS sub)"
        log("Computing is_composite fields by samples :")
        # for sid in samples:
        #     query = sql_pattern.format(samples[sid]["id"])
        #     log(" - sample {}".format(samples[sid]["id"]))
        #     Model.execute(query)
        log("Sample import from VCF Done")
        end = datetime.datetime.now()
        
        # update sample's progress indicator
        Model.execute("UPDATE sample SET status='ready', loading_progress=1  WHERE id IN ({})".format(",".join([str(samples[sid]["id"]) for sid in samples])))
        
        core.notify_all({"action": "import_vcf_end", "data" : {"reference_id": reference_id, "file_id" : file_id, "msg" : "Import done without error.", "samples": [ {"id" : samples[s]["id"], "name" : samples[s]["name"]} for s in samples.keys()]}})


        # When import is done, check if analyses are waiting for creation, and start working-table (wt) creation if all samples are ready
        # TODO
        sql = "SELECT DISTINCT(analysis_id) FROM analysis_sample WHERE sample_id IN ({})".format(",".join([str(samples[sid]["id"]) for sid in samples]))
        for row in Model.execute(sql):
            analysis = Model.Analysis.from_id(row.analysis_id,1)
            if analysis.status == "waiting":
                log("Auto initialisation of the analysis in witing state : {} ({})".format(analysis.name, analysis.id))
                core.filters.request(analysis.id, analysis.filter, analysis.fields)
Example #14
version = sys.argv[2]
#hpopath = "/var/regovar/databases/"
#version = "2018-03-09 09:06"

print(version)

# create path to hpo files to import
obopath = hpopath + "hpo.obo"
annotpath = hpopath + "hpo_annotation.txt"
nannotpath = hpopath + "hpo_annotation_neg.txt"
diseapath = hpopath + "hpo_disease.txt"
phenopath = hpopath + "hpo_phenotype.txt"

# Clear HPO tables
print('Clear database: ', end='', flush=True)
Model.execute("DELETE FROM hpo_phenotype")
Model.execute("DELETE FROM hpo_disease")
print('Done')

# temp dicts that store the direct child relations between a term and all its children
p_data = {}  # phenotype oriented data
d_data = {}  # disease oriented data


# TOOLS
def escape(value):
    if type(value) is str:
        value = value.replace('%%', '%')
        value = value.replace("'", "''")
    return value
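
A quick usage illustration of escape (single quotes are doubled for SQL string literals, '%%' placeholders are collapsed, and non-strings pass through unchanged):

print(escape("Gorlin's syndrome"))  # -> Gorlin''s syndrome
print(escape("100%%"))              # -> 100%
print(escape(42))                   # -> 42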
Example #15
    async def import_data(self, file_id, **kargs):
        """
            Import samples, variants and annotations from the provided file.
            This method checks the provided parameters and parses the header of the vcf to get the samples and to compute the number of lines
            that need to be parsed, which allows us to compute a progress indicator. The parsing is done by a delegate called in another thread.
            Returns the list of samples that have been added.
        """
        from core.core import core
        file = Model.File.from_id(file_id)
        filepath = file.path
        reference_id = kargs["reference_id"]
        start_0 = datetime.datetime.now()
        job_in_progress = []

        vcf_metadata = prepare_vcf_parsing(reference_id, filepath)
        db_ref_suffix = "_" + Model.execute(
            "SELECT table_suffix FROM reference WHERE id={}".format(
                reference_id)).first().table_suffix

        if filepath.endswith(".vcf") or filepath.endswith(".vcf.gz"):
            filepath += ".regovar_import"  # a tmp file have been created by prepare_vcf_parsing() method to avoid pysam unsupported file format.
            start = datetime.datetime.now()

            # Create vcf parser
            vcf_reader = VariantFile(filepath)

            # get samples in the VCF
            # samples = {i : Model.get_or_create(Model.Session(), Model.Sample, name=i)[0] for i in list((vcf_reader.header.samples))}
            samples = {}
            for i in vcf_reader.header.samples:
                sample = Model.Sample.new()
                sample.name = i
                sample.file_id = file_id
                sample.reference_id = reference_id
                sample.filter_description = {
                    filter[0]: filter[1].description
                    for filter in vcf_reader.header.filters.items()
                }
                sample.default_dbuid = []
                sample.status = "loading"
                for dbname in vcf_metadata["annotations"].keys():
                    if vcf_metadata["annotations"][dbname]:
                        sample.default_dbuid.append(
                            vcf_metadata["annotations"][dbname].db_uid)
                # TODO : is_mosaic according to the data in the vcf
                sample.save()

                # As these samples will be shared with other threads, we remove them from the sql session to avoid errors
                samples.update({i: sample.to_json()})

            if len(samples.keys()) == 0:
                war("VCF files without sample cannot be imported in the database."
                    )
                core.notify_all({
                    "action": "import_vcf_error",
                    "data": {
                        "reference_id":
                        reference_id,
                        "file_id":
                        file_id,
                        "msg":
                        "VCF files without sample cannot be imported in the database."
                    }
                })
                return

            # tasks queue shared by all thread
            self.queue = Queue(maxsize=0)
            # list of worker created to execute multithread tasks
            self.workers = []

            # init threading workers
            for i in range(VCF_IMPORT_MAX_THREAD):
                t = Thread(target=vcf_import_worker,
                           args=(self.queue, file_id, samples),
                           daemon=True)
                t.start()
                self.workers.append(t)

            core.notify_all({
                "action": "import_vcf_start",
                "data": {
                    "reference_id":
                    reference_id,
                    "file_id":
                    file_id,
                    "samples": [{
                        "id": samples[sid]["id"],
                        "name": samples[sid]["name"]
                    } for sid in samples.keys()]
                }
            })
            records_count = vcf_metadata["count"]
            log("Importing file {0}\n\r\trecords  : {1}\n\r\tsamples  :  ({2}) {3}\n\r\tstart    : {4}"
                .format(filepath, records_count, len(samples.keys()),
                        reprlib.repr([sid for sid in samples.keys()]), start))

            run_async(self.import_delegate, file_id, vcf_reader, reference_id,
                      db_ref_suffix, vcf_metadata, samples)

            return {
                "success": True,
                "samples": samples,
                "records_count": records_count
            }
        return {"success": False, "error": "File not supported"}
Example #16
async def import_data(file_id, filepath, core=None, reference_id=2):
    import ipdb

    import os
    import datetime
    import sqlalchemy
    import subprocess
    import multiprocessing as mp
    import reprlib
    import gzip
    from pysam import VariantFile

    from core.framework.common import log, war, err, RegovarException
    import core.model as Model

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Tools
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def count_vcf_row(filename):
        """
            Use linux OS commands to quickly count the variants to parse in the vcf file
        """
        bashCommand = 'grep -v "^#" ' + str(filename) + ' | wc -l'
        if filename.endswith("gz"):
            bashCommand = "z" + bashCommand
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        cmd_out = process.communicate()[0]
        return int(cmd_out.decode('utf8'))

    def debug_clear_header(filename):
        """
            A workaround to fix a bug with GVCF headers in pysam.
            EDIT : in fact the problem seems to be that pysam does not support some kinds of compression, so this command
            is still used to rezip the vcf in a supported format.
        """
        bashCommand = "grep -v '^##GVCFBlock' {} | gzip --best > /var/regovar/downloads/tmp_workaround".format(
            filename)
        if filename.endswith("gz"):
            bashCommand = "z" + bashCommand
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        bashCommand = "mv /var/regovar/downloads/tmp_workaround  {} ".format(
            filename)
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)

    def prepare_vcf_parsing(filename):
        """
            Parse vcf headers and return information about which data shall be parsed
            and stored in the database
        """
        # Extract headers
        debug_clear_header(filename)

        headers = {}
        samples = []
        _op = open
        if filename.endswith('gz') or filename.endswith('zip'):
            _op = gzip.open
        with _op(filename) as f:
            for line in f:
                if _op != open:
                    line = line.decode()
                if line.startswith('##'):
                    l = line[2:].strip()
                    l = [l[0:l.index('=')], l[l.index('=') + 1:]]
                    if l[0] not in headers.keys():
                        if l[0] == 'INFO':
                            headers[l[0]] = {}
                        else:
                            headers[l[0]] = []
                    if l[0] == 'INFO':
                        data = l[1][1:-1].split(',')
                        info_id = data[0][3:]
                        info_type = data[2][5:]
                        info_desc = data[3][13:-1]
                        headers['INFO'].update({
                            info_id: {
                                'type': info_type,
                                'description': info_desc
                            }
                        })
                    else:
                        headers[l[0]].append(l[1])
                elif line.startswith('#'):
                    samples = line[1:].strip().split('\t')[9:]
                else:
                    break

        # Check for VEP
        vep = {'vep': False}
        if 'VEP' in headers.keys() and 'CSQ' in headers['INFO'].keys():
            d = headers['INFO']['CSQ']['description'].split('Format:')
            vep = {
                'vep': {
                    'version': headers['VEP'][0].split(' ')[0],
                    'flag': 'CSQ',
                    'name': 'VEP',
                    'db_type': 'transcript',
                    'db_pk_field': 'Feature',
                    'description': d[0].strip(),
                    'columns': d[1].strip().split('|'),
                }
            }
            if 'Feature' not in vep['vep']['columns']:
                vep = {'vep': False}

        # Check for SnpEff
        snpeff = {'snpeff': False}
        if 'SnpEffVersion' in headers.keys():
            if 'ANN' in headers['INFO'].keys():
                # TODO
                pass
            elif 'EFF' in headers['INFO'].keys():
                d = headers['INFO']['EFF']['description'].split('\'')
                snpeff = {
                    'snpeff': {
                        'version':
                        headers['SnpEffVersion'][0].strip().strip('"').split(
                            ' ')[0],
                        'flag':
                        'EFF',
                        'name':
                        'SnpEff',
                        'db_type':
                        'transcript',
                        'db_pk_field':
                        'Transcript_ID',
                        'columns':
                        [c.strip() for c in d[1].strip().split('|')],
                        'description':
                        d[0].strip(),
                    }
                }
                if 'Transcript_ID' not in snpeff['snpeff']['columns']:
                    snpeff = {'snpeff': False}

        # Retrieve extension
        file_type = os.path.split(filename)[1].split('.')[-1]
        if 'vcf' not in file_type:
            # e.g. for "sample.vcf.gz", keep the compound extension "vcf.gz"
            file_type = os.path.split(filename)[1].split('.')[-2] + "." + file_type

        # Return result
        result = {
            'vcf_version': headers['fileformat'][0],
            'name': os.path.split(filename)[1],
            'count': count_vcf_row(filename),
            'size': os.path.getsize(filename),
            'type': file_type,
            'samples': samples,
            'annotations': {}
        }
        result['annotations'].update(vep)
        result['annotations'].update(snpeff)
        return result

    def normalise_annotation_name(name):
        """
            Tool to convert the name of an annotation tool/db/field/version into the corresponding valid name for the database
        """
        if name[0].isdigit():
            name = '_' + name

        def check_char(char):
            if char in ['.', '-', '_', '/']:
                return '_'
            elif char.isalnum():
                # TODO : remove accents
                return char.lower()
            else:
                return ''

        return ''.join(check_char(c) for c in name)
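    # Illustration, derived from the rules above:
    #   normalise_annotation_name('SnpEff_4.3t_hg19') -> 'snpeff_4_3t_hg19'
    #   normalise_annotation_name('2BE') -> '_2be'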

    def create_annotation_db(reference_id, reference_name, table_name,
                             vcf_annotation_metadata):
        """
            Create an annotation database according to information retrieved from the VCF file with the prepare_vcf_parsing method
        """
        # Create annotation table
        pk = 'transcript_id character varying(100), ' if vcf_annotation_metadata[
            'db_type'] == 'transcript' else ''
        pk2 = ',transcript_id' if vcf_annotation_metadata[
            'db_type'] == 'transcript' else ''
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, " + pk + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id" + pk2 + "));"
        query = ""
        db_map = {}
        fields = []
        for col in vcf_annotation_metadata['columns']:
            col_name = normalise_annotation_name(col)
            fields.append("{} text".format(col_name))
            db_map[col_name] = {
                'name': col_name,
                'type': 'string',
                'name_ui': col
            }  # By default, create a table with only text fields. Types can be changed by the user via a dedicated UI
        query += pattern.format(table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(
            table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(
            table_name)
        if vcf_annotation_metadata['db_type'] == 'transcript':
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(
                table_name)

        # Register annotation
        db_uid, pk_uid = Model.execute(
            "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
                table_name,
                normalise_annotation_name(
                    vcf_annotation_metadata['db_pk_field']))).first()
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format(  # We removed this condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/ in the jointure as this condition is already done by a previous query when updating working table with annotations
            db_uid, reference_id, table_name,
            vcf_annotation_metadata['version'],
            vcf_annotation_metadata['name'],
            vcf_annotation_metadata['description'], 30,
            vcf_annotation_metadata['db_type'], pk_uid)

        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES "
        for idx, f in enumerate(vcf_annotation_metadata['columns']):
            query += "('{0}', {1}, '{2}', '{3}', 'string'),".format(
                db_uid, idx, normalise_annotation_name(f), f)
        Model.execute(query[:-1])
        Model.execute(
            "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;"
        )
        return db_uid, db_map

    def prepare_annotation_db(reference_id, vcf_annotation_metadata):
        """
            Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
        """

        reference = Model.execute(
            "SELECT table_suffix FROM reference WHERE id={}".format(
                reference_id)).first()[0]
        table_name = normalise_annotation_name('{}_{}_{}'.format(
            vcf_annotation_metadata['flag'],
            vcf_annotation_metadata['version'], reference))

        # Get database schema (if available)
        table_cols = {}
        db_uid = Model.execute(
            "SELECT uid FROM annotation_database WHERE name='{}'".format(
                table_name)).first()

        if db_uid is None:
            # No table in the DB for this annotation : create a new table
            db_uid, table_cols = create_annotation_db(reference_id, reference,
                                                      table_name,
                                                      vcf_annotation_metadata)
        else:
            db_uid = db_uid[0]
            # Table already exists : retrieve columns already defined
            for col in Model.execute(
                    "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'"
                    .format(db_uid)):
                table_cols[col.name] = {
                    'name': col.name,
                    'type': col.type,
                    'name_ui': col.name_ui
                }
        # Get diff between columns in vcf and columns in DB, and update DB schema
        diff = []
        for col in vcf_annotation_metadata['columns']:
            if normalise_annotation_name(col) not in table_cols.keys():
                diff.append(col)
        if len(diff) > 0:
            offset = len(vcf_annotation_metadata['columns'])
            query = ""
            for idx, col in enumerate(diff):
                name = normalise_annotation_name(col)
                query += "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(
                    table_name, name, db_uid, offset + idx, col)
                table_cols[name] = {
                    'name': name,
                    'type': 'string',
                    'name_ui': col
                }

            # execute query
            Model.execute(query)
        # Update vcf_annotation_metadata with database mapping
        db_pk_field_uid = Model.execute(
            "SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".
            format(db_uid)).first().db_pk_field_uid
        vcf_annotation_metadata.update({
            'table': table_name,
            'db_uid': db_uid,
            'db_pk_field_uid': db_pk_field_uid
        })
        vcf_annotation_metadata['db_map'] = {}
        for col in vcf_annotation_metadata['columns']:
            vcf_annotation_metadata['db_map'][col] = table_cols[
                normalise_annotation_name(col)]
        return vcf_annotation_metadata
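
    # After prepare_annotation_db, vcf_annotation_metadata carries the DB mapping,
    # e.g. (illustrative values only):
    #   {'flag': 'CSQ', 'version': '87', ..., 'table': 'csq_87_hg19',
    #    'db_uid': '<md5>', 'db_pk_field_uid': '<md5>',
    #    'db_map': {'Allele': {'name': 'allele', 'type': 'string', 'name_ui': 'Allele'}, ...}}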

    def normalize_chr(chrm):
        """
            Normalize chromosome number from VCF format into Database format
        """
        chrm = chrm.upper()
        if chrm.startswith("CHROM"):
            chrm = chrm[5:]
        if chrm.startswith("CHRM") and chrm != "CHRM":
            chrm = chrm[4:]
        if chrm.startswith("CHR"):
            chrm = chrm[3:]

        if chrm == "X":
            chrm = 23
        elif chrm == "Y":
            chrm = 24
        elif chrm == "M":
            chrm = 25
        else:
            try:
                chrm = int(chrm)
            except Exception:
                # TODO : log/report error
                chrm = None
        return chrm
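
    # e.g. "chr7" -> 7, "X" -> 23, "chrY" -> 24, "M" -> 25; unparsable names -> None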

    def normalize(pos, ref, alt):
        """
            Normalize given (position, ref and alt) from VCF into Database format
             - Assuming that position in VCF are 1-based (0-based in Database)
             - triming ref and alt to get minimal alt (and update position accordingly)
        """
        # input pos comming from VCF are 1-based.
        # to be consistent with UCSC databases we convert it into 0-based
        pos -= 1

        if (ref == alt):
            return None, None, None
        if ref is None:
            ref = ''
        if alt is None:
            alt = ''
        while len(ref) > 0 and len(alt) > 0 and ref[0] == alt[0]:
            ref = ref[1:]
            alt = alt[1:]
            pos += 1
        if len(ref) == len(alt):
            while ref[-1:] == alt[-1:]:
                ref = ref[0:-1]
                alt = alt[0:-1]
        return pos, ref, alt
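
    # Worked example: normalize(100, 'AT', 'AC')
    #   pos 100 (1-based) -> 99 (0-based); common prefix 'A' trimmed -> pos 100
    #   result: (100, 'T', 'C')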

    def normalize_gt(infos):
        """
            Normalize GT sample informatin from VCF format into Database format
        """
        gt = get_info(infos, 'GT')
        if gt != 'NULL':
            if infos['GT'][0] == infos['GT'][1]:
                # Homozyot ref
                if infos['GT'][0] in [None, 0]:
                    return 0
                # Homozyot alt
                return '1'
            else:
                if 0 in infos['GT']:
                    # Hetero ref
                    return '2'
                else:
                    return '3'
            log("unknow : " + str(infos['GT']))
        return -1
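
    # Encoded genotypes: '0' homozygous ref, '1' homozygous alt,
    # '2' heterozygous ref/alt, '3' heterozygous alt/alt, -1 when GT is missing.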

    def get_alt(alt):
        """
            Retrieve alternative values from VCF data
        """
        if ('|' in alt):
            return alt.split('|')
        else:
            return alt.split('/')

    def get_info(infos, key):
        """
            Retrieving info annotation from VCF data
        """
        if (key in infos):
            if infos[key] is None: return 'NULL'
            return infos[key]
        return 'NULL'

    def is_transition(ref, alt):
        """
            Return true if the variant is a transversion; false otherwise
        """
        tr = ref + alt
        if len(ref) == 1 and tr in ('AG', 'GA', 'CT', 'TC'):
            return True
        return False
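
    # e.g. is_transition('A', 'G') -> True (purine <-> purine)
    #      is_transition('A', 'C') -> False (a transversion)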

    def escape_value_for_sql(value):
        if type(value) is str:
            value = value.replace('%', '%%')
            value = value.replace("'", "''")

        return value
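
    # e.g. escape_value_for_sql("O'Brien 100%") -> "O''Brien 100%%"
    # (doubled quotes escape SQL string literals; doubled '%' protects the value
    # from later %-style interpolation by the SQL driver)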

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Third-party code from vtools.  Bin index calculation
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    #
    # Utility function to calculate bins.
    #
    # This function implements a hashing scheme that UCSC uses (developed by Jim Kent) to
    # take in a genomic coordinate range and return a set of genomic "bins" that your range
    # intersects.  I found a Java implementation on-line (I need to find the URL) and I
    # simply manually converted the Java code into Python code.

    # IMPORTANT: Because this is UCSC code the start coordinates are 0-based and the end
    # coordinates are 1-based!!!!!!

    # BINRANGE_MAXEND_512M = 512 * 1024 * 1024
    # binOffsetOldToExtended = 4681; #  (4096 + 512 + 64 + 8 + 1 + 0)

    _BINOFFSETS = (
        512 + 64 + 8 +
        1,  # = 585, min val for level 0 bins (128kb binsize)    
        64 + 8 + 1,  # =  73, min val for level 1 bins (1Mb binsize) 
        8 + 1,  # =   9, min val for level 2 bins (8Mb binsize)  
        1,  # =   1, min val for level 3 bins (64Mb binsize)  
        0)  # =   0, only val for level 4 bin (512Mb binsize)

    #    1:   0000 0000 0000 0001    1<<0
    #    8:   0000 0000 0000 1000    1<<3
    #   64:   0000 0000 0100 0000    1<<6
    #  512:   0000 0010 0000 0000    1<<9

    _BINFIRSTSHIFT = 17
    # How much to shift to get to finest bin.
    _BINNEXTSHIFT = 3
    # How much to shift to get to next larger bin.
    _BINLEVELS = len(_BINOFFSETS)

    #
    # IMPORTANT: the start coordinate is 0-based and the end coordinate is 1-based.
    #
    def getUcscBins(start, end):
        bins = []
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end - 1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                bins.append(startBin + offset)
            else:
                # Include endBin itself (range() excludes its upper bound)
                for bin in range(startBin + offset, endBin + offset + 1):
                    bins.append(bin)
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bins

    def getMaxUcscBin(start, end):
        bin = 0
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end - 1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                if startBin + offset > bin:
                    bin = startBin + offset
            else:
                # Include endBin itself; use a distinct name to avoid shadowing
                # the level counter 'i'
                for b in range(startBin + offset, endBin + offset + 1):
                    if b > bin:
                        bin = b
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bin
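
    # Worked example: getMaxUcscBin(0, 1)
    #   startBin == endBin == 0 at every level, so the largest bin is the
    #   level-0 one: 0 + 585 = 585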

    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Import
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def transaction_end(job_id, result):
        job_in_progress.remove(job_id)
        if isinstance(result, Exception) or result is None:
            core.notify_all({
                'msg': 'import_vcf_end',
                'data': {
                    'file_id': file_id,
                    'msg': 'Error occurred : ' + str(result)
                }
            })

    start_0 = datetime.datetime.now()
    job_in_progress = []
    samples = {}  # populated below; stays empty if the file is not a VCF

    vcf_metadata = prepare_vcf_parsing(filepath)
    db_ref_suffix = "_" + Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(
            reference_id)).first().table_suffix

    # Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
    for annotation in vcf_metadata['annotations'].keys():
        if vcf_metadata['annotations'][annotation]:
            data = prepare_annotation_db(
                reference_id, vcf_metadata['annotations'][annotation])
            vcf_metadata['annotations'][annotation].update(data)

    if filepath.endswith(".vcf") or filepath.endswith(".vcf.gz"):
        start = datetime.datetime.now()

        # Create vcf parser
        vcf_reader = VariantFile(filepath)

        # get samples in the VCF
        samples = {
            i: Model.get_or_create(Model.session(), Model.Sample, name=i)[0]
            for i in list((vcf_reader.header.samples))
        }

        if len(samples.keys()) == 0:
            war("VCF files without sample cannot be imported in the database.")
            if core is not None:
                core.notify_all({
                    'msg': 'import_vcf_end',
                    'data': {
                        'file_id':
                        file_id,
                        'msg':
                        "VCF files without sample cannot be imported in the database."
                    }
                })
            return

        if core is not None:
            core.notify_all({
                'msg': 'import_vcf_start',
                'data': {
                    'file_id':
                    file_id,
                    'samples': [{
                        'id': samples[s].id,
                        'name': samples[s].name
                    } for s in samples.keys()]
                }
            })

        # Associate sample to the file
        Model.execute(
            "INSERT INTO sample_file (sample_id, file_id) VALUES {0} ON CONFLICT DO NOTHING;"
            .format(','.join([
                "({0}, {1})".format(samples[sid].id, file_id)
                for sid in samples
            ])))

        # parsing vcf file
        records_count = vcf_metadata['count']
        records_current = 0
        table = "variant" + db_ref_suffix
        log("Importing file {0}\n\r\trecords  : {1}\n\r\tsamples  :  ({2}) {3}\n\r\tstart    : {4}"
            .format(filepath, records_count, len(samples.keys()),
                    reprlib.repr([s for s in samples.keys()]), start))
        # bar = Bar('\tparsing  : ', max=records_count, suffix='%(percent).1f%% - %(elapsed_td)s')

        sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}])  WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
        sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, bin, chr, pos, ref, alt, genotype, depth) SELECT {0}, id, {1}, '{2}', {3}, '{4}', '{5}', '{6}', {7} FROM variant" + db_ref_suffix + " WHERE bin={1} AND chr={2} AND pos={3} AND ref='{4}' AND alt='{5}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
        sql_pattern3 = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, transcript_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, transcript_id) DO  NOTHING;"  # TODO : on conflict, shall update fields with value in the VCF to complete database annotation with (maybe) new fields
        sql_query1 = ""
        sql_query2 = ""
        sql_query3 = ""
        count = 0
        for r in vcf_reader:
            records_current += 1
            if core is not None:
                core.notify_all({
                    'msg': 'import_vcf',
                    'data': {
                        'file_id':
                        file_id,
                        'progress_total':
                        records_count,
                        'progress_current':
                        records_current,
                        'progress_percent':
                        round(records_current / max(1, records_count) * 100, 2)
                    }
                })

            chrm = normalize_chr(str(r.chrom))
            samples_array = ','.join([str(samples[s].id) for s in r.samples])
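            # NOTE: diploid calls are assumed below (both s.alleles[0] and s.alleles[1] are read)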
            for sn in r.samples:
                s = r.samples.get(sn)
                if (len(s.alleles) > 0):
                    pos, ref, alt = normalize(r.pos, r.ref, s.alleles[0])
                    if pos is not None and alt != ref:
                        bin = getMaxUcscBin(pos, pos + len(ref))
                        sql_query1 += sql_pattern1.format(
                            table, chrm, pos, ref, alt,
                            is_transition(ref, alt), bin, samples_array)
                        sql_query2 += sql_pattern2.format(
                            samples[sn].id, bin, chrm, pos, ref, alt,
                            normalize_gt(s), get_info(s, 'DP'))
                        count += 1

                    pos, ref, alt = normalize(r.pos, r.ref, s.alleles[1])
                    if pos is not None and alt != ref:
                        bin = getMaxUcscBin(pos, pos + len(ref))
                        sql_query1 += sql_pattern1.format(
                            table, chrm, pos, ref, alt,
                            is_transition(ref, alt), bin, samples_array)
                        sql_query2 += sql_pattern2.format(
                            samples[sn].id, bin, chrm, pos, ref, alt,
                            normalize_gt(s), get_info(s, 'DP'))
                        count += 1

                    # Import custom annotation for the variant
                    for ann_name, metadata in vcf_metadata[
                            'annotations'].items():
                        if metadata:
                            # By transcript (r.info[flag] is a list of annotations; inside we find transcript and allele information needed to save data for the current variant)
                            for info in r.info[metadata['flag']]:
                                data = info.split('|')
                                q_fields = []
                                q_values = []
                                allele = ""
                                trx_pk = "NULL"
                                for col_pos, col_name in enumerate(
                                        metadata['columns']):
                                    q_fields.append(
                                        metadata['db_map'][col_name]['name'])
                                    val = escape_value_for_sql(data[col_pos])

                                    if col_name == 'Allele':
                                        allele = val.strip().strip("-")
                                    if col_name == metadata['db_pk_field']:
                                        trx_pk = val.strip()

                                    q_values.append(
                                        '\'{}\''.format(val) if val != ''
                                        and val is not None else 'NULL')

                                pos, ref, alt = normalize(
                                    r.pos, r.ref, s.alleles[0])
                                if pos is not None and alt == allele:
                                    # Recompute the bin for this normalized allele
                                    bin = getMaxUcscBin(pos, pos + len(ref))
                                    sql_query3 += sql_pattern3.format(
                                        metadata['table'], ','.join(q_fields),
                                        ','.join(q_values), bin, chrm, pos,
                                        ref, alt, trx_pk)
                                    count += 1
                                pos, ref, alt = normalize(
                                    r.pos, r.ref, s.alleles[1])
                                if pos is not None and alt == allele:
                                    bin = getMaxUcscBin(pos, pos + len(ref))
                                    sql_query3 += sql_pattern3.format(
                                        metadata['table'], ','.join(q_fields),
                                        ','.join(q_values), bin, chrm, pos,
                                        ref, alt, trx_pk)
                                    count += 1

                    # Split big requests into chunks to avoid out-of-memory SQL transactions
                    if count >= 10000:
                        count = 0
                        # Model.execute_async(transaction1 + transaction2 + transaction3, transaction_end)
                        transaction = sql_query1 + sql_query2 + sql_query3
                        log("VCF import : Execute async query (as coroutine)")
                        await Model.execute_aio(transaction)
                        # job_id = Model.execute_bw(transaction, transaction_end)
                        # job_in_progress.append(job_id)
                        # log("VCF import : Execute async query, new job_id : {}. Jobs running [{}]".format(job_id, ','.join([job_in_progress])))
                        # Reset query buffers
                        sql_query1 = ""
                        sql_query2 = ""
                        sql_query3 = ""

        # Loop done, execute last pending query
        log("VCF import : Execute last async query (as coroutine)")
        transaction = sql_query1 + sql_query2 + sql_query3
        await Model.execute_aio(transaction)
        log("VCF import : Done")

    end = datetime.datetime.now()
    if core is not None:
        core.notify_all({
            'msg': 'import_vcf_end',
            'data': {
                'file_id':
                file_id,
                'msg':
                'Import done without error.',
                'samples': [{
                    'id': samples[s].id,
                    'name': samples[s].name
                } for s in samples.keys()]
            }
        })
Example #17
0
    def check_annotation_table(self):
        """
            Check if annotation table exists and create it according to information collected by the init method
        """
        # check if the annotation table already exists

        columns_mapping = {}
        db_uid = Model.execute(
            "SELECT uid FROM annotation_database WHERE name='{}'".format(
                self.table_name)).first()

        if db_uid is not None:
            db_uid = db_uid[0]
        else:
            # Create new table
            pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, regovar_trx_id character varying(100), {1}, CONSTRAINT {0}_ukey UNIQUE (variant_id, regovar_trx_id));"
            query = ""
            db_map = {}
            fields = []
            type_map = {
                "string": "text",
                "int": "integer",
                "float": "real",
                "bool": "boolean",
                "enum": "varchar(50)",
                "list": "varchar(250)[]"
            }
            for col_name in self.columns_definitions.keys():
                fields.append("{} {}".format(
                    col_name,
                    type_map[self.columns_definitions[col_name]["type"]]))
            query += pattern.format(self.table_name, ', '.join(fields))
            query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(
                self.table_name)
            query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(
                self.table_name)

            # Register annotation DB
            db_uid, pk_uid = Model.execute(
                "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
                    self.table_name, self.colums_as_pk)).first()
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (regovar_trx_id);".format(
                self.table_name)
            query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
            q = "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} {{0}} ON {{0}}.bin={{1}}.bin AND {{0}}.chr={{1}}.chr AND {{0}}.pos={{1}}.pos AND {{0}}.ref={{1}}.ref AND {{0}}.alt={{1}}.alt');"
            query += q.format(db_uid, self.reference_id, self.table_name,
                              self.version, self.name, self.description, 30,
                              'transcript', pk_uid)
            query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type, description, meta) VALUES "

            # Register annotation Fields
            fields = [field for field in self.columns_definitions.keys()]
            fields.sort()
            for col_name in fields:
                query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}),".format(
                    db_uid, self.columns_definitions[col_name]["order"],
                    col_name, col_name.title(),
                    self.columns_definitions[col_name]["type"],
                    self.escape_value_for_sql(
                        self.columns_definitions[col_name]["description"]),
                    "'" + self.escape_value_for_sql(
                        self.columns_definitions[col_name]["meta"]) + "'" if
                    "meta" in self.columns_definitions[col_name] else "NULL")
            Model.execute(query[:-1])
            Model.execute(
                "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;"
            )

        # # Pre-process of polyphen/sift vcf columns that are split on 2 columns in regovar db
        # self.columns = [self.normalise_annotation_name(s) for s in self.columns]
        # if "sift" in self.columns:
        #     self.columns.extend(["sift_pred", "sift_score"])
        #     self.columns.remove("sift")
        # if "polyphen" in self.columns:
        #     self.columns.extend(["polyphen_pred", "polyphen_score"])
        #     self.columns.remove("polyphen")

        # Retrieve column mapping for column in vcf
        self.columns = [
            self.normalise_annotation_name(s) for s in self.columns
        ]

        for col in Model.execute(
                "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'"
                .format(db_uid)):
            if col.name in self.columns:
                columns_mapping[col.name] = {
                    'name': col.name,
                    'type': col.type,
                    'name_ui': col.name_ui
                }
        for col in self.columns:
            if col not in columns_mapping.keys():
                columns_mapping[col] = False

        self.db_uid = db_uid
        self.columns_mapping = columns_mapping
        return db_uid, columns_mapping
Example #18
0
    def check_annotation_table(self):
        """
            Check if annotation table exists and create it according to information collected by the init method
        """
        # check if the annotation table already exists
        
        columns_mapping = {}
        db_uid = Model.execute("SELECT uid FROM annotation_database WHERE name='{}'".format(self.table_name)).first()
        
        if db_uid is not None:
            db_uid = db_uid[0]
        else:
            # Create new table
            pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, regovar_trx_id character varying(100), {1}, CONSTRAINT {0}_ukey UNIQUE (variant_id, regovar_trx_id));"
            query   = ""
            db_map = {}
            fields = []
            type_map = {"string" : "text", "int" : "integer", "float" : "real", "bool" : "boolean", "enum" : "varchar(50)", "list" : "varchar(250)[]"}
            for col_name in self.columns_definitions.keys():
                fields.append("{} {}".format(col_name, type_map[self.columns_definitions[col_name]["type"]])) 
            query += pattern.format(self.table_name, ', '.join(fields))
            query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(self.table_name)
            query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(self.table_name)
            
            # Register annotation DB
            db_uid, pk_uid = Model.execute("SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(self.table_name, self.colums_as_pk)).first()
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (regovar_trx_id);".format(self.table_name)
            query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
            q = "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} {{0}} ON {{0}}.bin={{1}}.bin AND {{0}}.chr={{1}}.chr AND {{0}}.pos={{1}}.pos AND {{0}}.ref={{1}}.ref AND {{0}}.alt={{1}}.alt');"
            query += q.format(
                db_uid, 
                self.reference_id, 
                self.table_name, 
                self.version, 
                self.name, 
                self.description, 
                30, 
                'transcript',
                pk_uid)
            query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type, description) VALUES "
            
            # Register annotation Fields
            fields = [field for field in self.columns_definitions.keys()]
            fields.sort()
            for idx, col_name in enumerate(fields):
                query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}'),".format(db_uid, idx, col_name, col_name.title(), self.columns_definitions[col_name]["type"], self.escape_value_for_sql(self.columns_definitions[col_name]["description"]))
            
            Model.execute(query[:-1])
            Model.execute("UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;")
        


        # Retrieve column mapping for column in vcf
        self.columns = [self.normalise_annotation_name(s) for s in self.columns]

        for col in Model.execute("SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'".format(db_uid)):
            if col.name in self.columns:
                columns_mapping[col.name] = {'name': col.name, 'type': col.type, 'name_ui': col.name_ui}
        for col in self.columns:
            if col not in columns_mapping.keys():
                columns_mapping[col] = False

        self.db_uid = db_uid
        self.columns_mapping = columns_mapping
        return db_uid, columns_mapping
Example #19
0
async def import_data(file_id, filepath, core=None, reference_id = 2):

    import os
    import datetime
    import sqlalchemy
    import subprocess
    import multiprocessing as mp
    import reprlib
    import gzip
    from pysam import VariantFile

    from core.framework.common import log, war, err, RegovarException
    import core.model as Model





    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Tools
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

    def count_vcf_row(filename):
        """
            Use linux OS commands to quickly count variant to parse in the vcf file
        """
        bashCommand = 'grep -v "^#" ' + str(filename) +' | wc -l'
        if filename.endswith("gz"):
            bashCommand = "z" + bashCommand
        process = subprocess.Popen(bashCommand, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        cmd_out = process.communicate()[0]
        return int(cmd_out.decode('utf8'))
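
    # e.g. for "sample.vcf.gz" the executed command is: zgrep -v "^#" sample.vcf.gz | wc -l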


    def debug_clear_header(filename):
        """
            A workaround to fix a bug with GVCF header with pysam
            EDIT : in fact the problem to be that pysam do not support some kind of compression, so this command 
            is still used to rezip the vcf in a supported format.
        """
        bashCommand = "grep -v '^##GVCFBlock' {} | gzip --best > /var/regovar/downloads/tmp_workaround".format(filename)
        if filename.endswith("gz"):
            bashCommand = "z" + bashCommand
        process = subprocess.Popen(bashCommand, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        bashCommand = "mv /var/regovar/downloads/tmp_workaround  {} ".format(filename)
        process = subprocess.Popen(bashCommand, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)



    def prepare_vcf_parsing(filename):
        """
            Parse VCF headers and return information about which data shall be parsed
            and stored in the database
        """
        # Extract headers
        debug_clear_header(filename)

        headers = {}
        samples = []
        _op = open
        if filename.endswith('gz') or filename.endswith('zip'):
            _op = gzip.open
        with _op(filename) as f:
            for line in f:
                if _op != open:
                    line = line.decode()
                if line.startswith('##'):
                    l = line[2:].strip()
                    l = [l[0:l.index('=')], l[l.index('=')+1:]]
                    if l[0] not in headers.keys():
                        if l[0] == 'INFO' :
                            headers[l[0]] = {}
                        else:
                            headers[l[0]] = []
                    if l[0] == 'INFO' :
                        data = l[1][1:-1].split(',')
                        info_id   = data[0][3:]
                        info_type = data[2][5:]
                        info_desc = data[3][13:-1]
                        headers['INFO'].update({info_id : {'type' : info_type, 'description' : info_desc}})
                    else:
                        headers[l[0]].append(l[1])
                elif line.startswith('#'):
                    samples = line[1:].strip().split('\t')[9:]
                else:
                    break

        # Check for VEP
        vep = {'vep' : False}
        if 'VEP' in headers.keys() and 'CSQ' in headers['INFO'].keys():
            d = headers['INFO']['CSQ']['description'].split('Format:')
            vep = {
                'vep' : {
                    'version' : headers['VEP'][0].split(' ')[0],
                    'flag' : 'CSQ',
                    'name' : 'VEP',
                    'db_type' : 'transcript',
                    'db_pk_field' : 'Feature',
                    'description' : d[0].strip(),
                    'columns' : d[1].strip().split('|'),
                }
            }
            if 'Feature' not in vep['vep']['columns']:
                vep = {'vep' : False }

        # Check for SnpEff
        snpeff = {'snpeff' : False }
        if 'SnpEffVersion' in headers.keys() :
            if 'ANN' in headers['INFO'].keys():
                # TODO
                pass
            elif 'EFF' in headers['INFO'].keys():
                d = headers['INFO']['EFF']['description'].split('\'')
                snpeff = {
                    'snpeff' : {
                        'version' : headers['SnpEffVersion'][0].strip().strip('"').split(' ')[0],
                        'flag' : 'EFF',
                        'name' : 'SnpEff',
                        'db_type' : 'transcript',
                        'db_pk_field' : 'Transcript_ID',
                        'columns' : [c.strip() for c in d[1].strip().split('|')],
                        'description' : d[0].strip(),
                    }
                }
                if 'Transcript_ID' not in snpeff['snpeff']['columns']:
                    snpeff = {'snpeff' : False }


        # Retrieve extension (keep compound extensions like "vcf.gz")
        file_type = os.path.split(filename)[1].split('.')[-1]
        if 'vcf' not in file_type:
            file_type = os.path.split(filename)[1].split('.')[-2] + "." + file_type

        # Return result
        result = {
            'vcf_version' : headers['fileformat'][0],
            'name'  : os.path.split(filename)[1],
            'count' : count_vcf_row(filename),
            'size'  : os.path.getsize(filename),
            'type'  : file_type,
            'samples' : samples,
            'annotations' : {}
        }
        result['annotations'].update(vep)
        result['annotations'].update(snpeff)
        return result
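
    # Illustrative result for a VEP-annotated file (actual values depend on the headers):
    #   {'vcf_version': 'VCFv4.2', 'name': 'sample.vcf.gz', 'count': 1234, 'size': 56789,
    #    'type': 'vcf.gz', 'samples': ['S1', 'S2'],
    #    'annotations': {'vep': {'flag': 'CSQ', 'db_type': 'transcript', ...}, 'snpeff': False}}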


    def normalise_annotation_name(name):
        """
            Tool to convert a name of a annotation tool/db/field/version into the corresponding valid name for the database
        """
        if name[0].isdigit():
            name = '_'+name
        def check_char(char):
            if char in ['.', '-', '_', '/']:
                return '_'
            elif char.isalnum():
                # TODO : remove accents
                return char.lower()
            else:
                return ''
        return ''.join(check_char(c) for c in name)


    def create_annotation_db(reference_id, reference_name, table_name, vcf_annotation_metadata):
        """
            Create an annotation database according to information retrieved from the VCF file with the prepare_vcf_parsing method
        """
        # Create annotation table
        pk = 'transcript_id character varying(100), ' if vcf_annotation_metadata['db_type'] == 'transcript' else ''
        pk2 = ',transcript_id' if vcf_annotation_metadata['db_type'] == 'transcript' else ''
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, " + pk + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id" + pk2 +"));"
        query   = ""
        db_map = {}
        fields = []
        for col in vcf_annotation_metadata['columns']:
            col_name = normalise_annotation_name(col)
            fields.append("{} text".format(col_name))
            db_map[col_name] = { 'name' : col_name, 'type' : 'string', 'name_ui' : col }  # By default, create a table with only text field. Type can be changed by user via a dedicated UI
        query += pattern.format(table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(table_name)
        if vcf_annotation_metadata['db_type'] == 'transcript':
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(table_name)

        # Register annotation
        db_uid, pk_uid = Model.execute("SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(table_name, normalise_annotation_name(vcf_annotation_metadata['db_pk_field']))).first()
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format( # We removed this condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/ in the jointure as this condition is already done by a previous query when updating working table with annotations
            db_uid, 
            reference_id, 
            table_name, 
            vcf_annotation_metadata['version'], 
            vcf_annotation_metadata['name'], 
            vcf_annotation_metadata['description'], 
            30, 
            vcf_annotation_metadata['db_type'],
            pk_uid)  

        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES "
        for idx, f in enumerate(vcf_annotation_metadata['columns']):
            query += "('{0}', {1}, '{2}', '{3}', 'string'),".format(db_uid, idx, normalise_annotation_name(f), f)
        Model.execute(query[:-1])
        Model.execute("UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;")
        return db_uid, db_map


    def prepare_annotation_db(reference_id, vcf_annotation_metadata):
        """
            Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
        """

        reference  = Model.execute("SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first()[0]
        table_name = normalise_annotation_name('{}_{}_{}'.format(vcf_annotation_metadata['flag'], vcf_annotation_metadata['version'], reference))
        
        # Get database schema (if available)
        table_cols = {}
        db_uid     = Model.execute("SELECT uid FROM annotation_database WHERE name='{}'".format(table_name)).first()

        if db_uid is None:
            # No table in the DB for this annotation : create a new table
            db_uid, table_cols = create_annotation_db(reference_id, reference, table_name, vcf_annotation_metadata)
        else:
            db_uid = db_uid[0]
            # Table already exists : retrieve columns already defined
            for col in Model.execute("SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'".format(db_uid)):
                table_cols[col.name] = {'name': col.name, 'type': col.type, 'name_ui': col.name_ui}
        # Get diff between columns in vcf and columns in DB, and update DB schema
        diff = []
        for col in vcf_annotation_metadata['columns']:
            if normalise_annotation_name(col) not in table_cols.keys():
                diff.append(col)
        if len(diff) > 0 :
            offset = len(vcf_annotation_metadata['columns'])
            query = ""
            for idx, col in enumerate(diff):
                name=normalise_annotation_name(col)
                query += "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(table_name, name, db_uid, offset + idx, col)
                table_cols[name] = {'name': name, 'type': 'string', 'name_ui': col}

            # execute query
            Model.execute(query)
        # Update vcf_annotation_metadata with database mapping
        db_pk_field_uid = Model.execute("SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".format(db_uid)).first().db_pk_field_uid
        vcf_annotation_metadata.update({'table': table_name, 'db_uid': db_uid, 'db_pk_field_uid': db_pk_field_uid})
        vcf_annotation_metadata['db_map'] = {}
        for col in vcf_annotation_metadata['columns']:
            vcf_annotation_metadata['db_map'][col] = table_cols[normalise_annotation_name(col)]
        return vcf_annotation_metadata


    def normalize_chr(chrm):
        """
            Normalize chromosome number from VCF format into Database format
        """
        chrm = chrm.upper()
        if chrm.startswith("CHROM"):
            chrm = chrm[5:]
        if chrm.startswith("CHRM") and chrm != "CHRM":
            chrm = chrm[4:]
        if chrm.startswith("CHR"):
            chrm = chrm[3:]

        if chrm == "X":
            chrm = 23
        elif chrm == "Y":
            chrm = 24
        elif chrm == "M":
            chrm = 25
        else:
            try:
                chrm = int(chrm)
            except Exception:
                # TODO : log/report error
                chrm = None
        return chrm


    def normalize(pos, ref, alt):
        """
            Normalize given (position, ref and alt) from VCF into Database format
             - Assuming that position in VCF are 1-based (0-based in Database)
             - triming ref and alt to get minimal alt (and update position accordingly)
        """
        # input pos comming from VCF are 1-based.
        # to be consistent with UCSC databases we convert it into 0-based
        pos -= 1

        if (ref == alt):
            return None,None,None
        if ref is None:
            ref = ''
        if alt is None:
            alt = ''
        while len(ref) > 0 and len(alt) > 0 and ref[0]==alt[0] :
            ref = ref[1:]
            alt = alt[1:]
            pos += 1
        if len(ref) == len(alt):
            while ref[-1:]==alt[-1:]:
                ref = ref[0:-1]
                alt = alt[0:-1]
        return pos, ref, alt


    def normalize_gt(infos):
        """
            Normalize GT sample informatin from VCF format into Database format
        """
        gt = get_info(infos, 'GT')
        if gt != 'NULL':
            if infos['GT'][0] == infos['GT'][1]:
                # Homozyot ref
                if infos['GT'][0] in [None, 0] : 
                    return 0
                # Homozyot alt
                return '1'
            else :
                if 0 in infos['GT'] :
                    # Hetero ref
                    return '2'
                else :
                    return '3'
            log ("unknow : " + str(infos['GT']) )
        return -1


    def get_alt(alt):
        """
            Retrieve alternative values from VCF data
        """
        if ('|' in alt):
            return alt.split('|')
        else:
            return alt.split('/')


    def get_info(infos, key):
        """
            Retrieving info annotation from VCF data
        """
        if (key in infos):
            if infos[key] is None : return 'NULL'
            return infos[key]
        return 'NULL'



    def is_transition(ref, alt):
        """
            Return true if the variant is a transversion; false otherwise
        """
        tr = ref+alt
        if len(ref) == 1 and tr in ('AG', 'GA', 'CT', 'TC'):
            return True
        return False



    def escape_value_for_sql(value):
        if type(value) is str:
            value = value.replace('%', '%%')
            value = value.replace("'", "''")

        return value





    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Third-party code from vtools.  Bin index calculation
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


    #
    # Utility function to calculate bins.
    #
    # This function implements a hashing scheme that UCSC uses (developed by Jim Kent) to 
    # take in a genomic coordinate range and return a set of genomic "bins" that your range
    # intersects.  I found a Java implementation on-line (I need to find the URL) and I
    # simply manually converted the Java code into Python code.  
        
    # IMPORTANT: Because this is UCSC code the start coordinates are 0-based and the end 
    # coordinates are 1-based!!!!!!
            
    # BINRANGE_MAXEND_512M = 512 * 1024 * 1024
    # binOffsetOldToExtended = 4681; #  (4096 + 512 + 64 + 8 + 1 + 0)

    _BINOFFSETS = (
        512+64+8+1,   # = 585, min val for level 0 bins (128kb binsize)    
        64+8+1,       # =  73, min val for level 1 bins (1Mb binsize) 
        8+1,          # =   9, min val for level 2 bins (8Mb binsize)  
        1,            # =   1, min val for level 3 bins (64Mb binsize)  
        0)            # =   0, only val for level 4 bin (512Mb binsize)
         
    #    1:   0000 0000 0000 0001    1<<0       
    #    8:   0000 0000 0000 1000    1<<3
    #   64:   0000 0000 0100 0000    1<<6
    #  512:   0000 0010 0000 0000    1<<9
     
    _BINFIRSTSHIFT = 17             # How much to shift to get to finest bin.
    _BINNEXTSHIFT = 3               # How much to shift to get to next larger bin.
    _BINLEVELS = len(_BINOFFSETS)
      
    #
    # IMPORTANT: the start coordinate is 0-based and the end coordinate is 1-based.
    #
    def getUcscBins(start, end):
        bins = []
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end-1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                bins.append(startBin + offset)
            else:
                # Include endBin itself (range() excludes its upper bound)
                for bin in range(startBin + offset, endBin + offset + 1):
                    bins.append(bin)
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bins

    def getMaxUcscBin(start, end):
        bin = 0
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end-1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                if startBin + offset > bin:
                    bin = startBin + offset
            else:
                # Include endBin itself; use a distinct name to avoid shadowing
                # the level counter 'i'
                for b in range(startBin + offset, endBin + offset + 1):
                    if b > bin:
                        bin = b
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bin









    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
    # Import 
    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #




    def transaction_end(job_id, result):
        job_in_progress.remove(job_id)
        if isinstance(result, Exception) or result is None:
            core.notify_all({'msg':'import_vcf_end', 'data' : {'file_id' : file_id, 'msg' : 'Error occurred : ' + str(result)}})



    start_0 = datetime.datetime.now()
    job_in_progress = []
    samples = {}  # populated below; stays empty if the file is not a VCF

    vcf_metadata = prepare_vcf_parsing(filepath)
    db_ref_suffix= "_" + Model.execute("SELECT table_suffix FROM reference WHERE id={}".format(reference_id)).first().table_suffix

    # Prepare database for import of custom annotation, and set the mapping between VCF info fields and DB schema
    for annotation in vcf_metadata['annotations'].keys():
        if vcf_metadata['annotations'][annotation]:
            data = prepare_annotation_db(reference_id, vcf_metadata['annotations'][annotation])
            vcf_metadata['annotations'][annotation].update(data)


    if filepath.endswith(".vcf") or filepath.endswith(".vcf.gz"):
        start = datetime.datetime.now()

        # Create vcf parser
        vcf_reader = VariantFile(filepath)

        # get samples in the VCF 
        samples = {i : Model.get_or_create(Model.session(), Model.Sample, name=i)[0] for i in list((vcf_reader.header.samples))}

        if len(samples.keys()) == 0:
            war("VCF files without samples cannot be imported in the database.")
            if core is not None:
                core.notify_all({'msg':'import_vcf_end', 'data' : {'file_id' : file_id, 'msg' : "VCF files without samples cannot be imported in the database."}})
            return

        if core is not None:
            core.notify_all({'msg':'import_vcf_start', 'data' : {'file_id' : file_id, 'samples' : [ {'id' : samples[s].id, 'name' : samples[s].name} for s in samples.keys()]}})


        # Associate sample to the file
        Model.execute("INSERT INTO sample_file (sample_id, file_id) VALUES {0} ON CONFLICT DO NOTHING;".format( ','.join(["({0}, {1})".format(samples[sid].id, file_id) for sid in samples])))



        # parsing vcf file
        records_count = vcf_metadata['count']
        records_current = 0
        table = "variant" + db_ref_suffix
        log ("Importing file {0}\n\r\trecords  : {1}\n\r\tsamples  :  ({2}) {3}\n\r\tstart    : {4}".format(filepath, records_count, len(samples.keys()), reprlib.repr([s for s in samples.keys()]), start))
        # bar = Bar('\tparsing  : ', max=records_count, suffix='%(percent).1f%% - %(elapsed_td)s')
        
        sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}])  WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
        sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, bin, chr, pos, ref, alt, genotype, depth) SELECT {0}, id, {1}, '{2}', {3}, '{4}', '{5}', '{6}', {7} FROM variant" + db_ref_suffix + " WHERE bin={1} AND chr={2} AND pos={3} AND ref='{4}' AND alt='{5}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
        sql_pattern3 = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, transcript_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, transcript_id) DO  NOTHING;" # TODO : on conflict, shall update fields with value in the VCF to complete database annotation with (maybe) new fields
        sql_query1 = ""
        sql_query2 = ""
        sql_query3 = ""
        count = 0
        for r in vcf_reader: 
            records_current += 1 
            if core is not None:
                core.notify_all({'msg':'import_vcf', 'data' : {'file_id' : file_id, 'progress_total' : records_count, 'progress_current' : records_current, 'progress_percent' : round(records_current / max(1,records_count) * 100, 2)}})
            
            chrm = normalize_chr(str(r.chrom))
            samples_array = ','.join([str(samples[s].id) for s in r.samples])
            for sn in r.samples:
                s = r.samples.get(sn)
                if (len(s.alleles) > 0) :
                    pos, ref, alt = normalize(r.pos, r.ref, s.alleles[0])
                    if pos is not None and alt != ref :
                        bin = getMaxUcscBin(pos, pos + len(ref))
                        sql_query1 += sql_pattern1.format(table, chrm, pos, ref, alt, is_transition(ref, alt), bin, samples_array)
                        sql_query2 += sql_pattern2.format(samples[sn].id, bin, chrm, pos, ref, alt, normalize_gt(s), get_info(s, 'DP'))
                        count += 1

                    pos, ref, alt = normalize(r.pos, r.ref, s.alleles[1])
                    if pos is not None and alt != ref :
                        bin = getMaxUcscBin(pos, pos + len(ref))
                        sql_query1 += sql_pattern1.format(table, chrm, pos, ref, alt, is_transition(ref, alt), bin, samples_array)
                        sql_query2 += sql_pattern2.format(samples[sn].id, bin, chrm, pos, ref, alt, normalize_gt(s), get_info(s, 'DP'))
                        count += 1


                    # Import custom annotation for the variant
                    for ann_name, metadata in vcf_metadata['annotations'].items():
                        if metadata:
                            # By transcript (r.info[flag] is a list of annotations; inside we find transcript and allele information needed to save data for the current variant)
                            for info in r.info[metadata['flag']]:
                                data = info.split('|')
                                q_fields = []
                                q_values = []
                                allele   = ""
                                trx_pk = "NULL"
                                for col_pos, col_name in enumerate(metadata['columns']):
                                    q_fields.append(metadata['db_map'][col_name]['name'])
                                    val = escape_value_for_sql(data[col_pos])

                                    if col_name == 'Allele':
                                        allele = val.strip().strip("-")
                                    if col_name == metadata['db_pk_field']:
                                        trx_pk = val.strip()

                                    q_values.append('\'{}\''.format(val) if val != '' and val is not None else 'NULL')

                                pos, ref, alt = normalize(r.pos, r.ref, s.alleles[0])
                                if pos is not None and alt==allele:
                                    # Recompute the bin for this normalized allele
                                    bin = getMaxUcscBin(pos, pos + len(ref))
                                    sql_query3 += sql_pattern3.format(metadata['table'], ','.join(q_fields), ','.join(q_values), bin, chrm, pos, ref, alt, trx_pk)
                                    count += 1
                                pos, ref, alt = normalize(r.pos, r.ref, s.alleles[1])
                                if pos is not None and alt==allele:
                                    bin = getMaxUcscBin(pos, pos + len(ref))
                                    sql_query3 += sql_pattern3.format(metadata['table'], ','.join(q_fields), ','.join(q_values), bin, chrm, pos, ref, alt, trx_pk)
                                    count += 1


                    # Split big requests into chunks to avoid out-of-memory SQL transactions
                    if count >= 10000:
                        count = 0
                        # Model.execute_async(transaction1 + transaction2 + transaction3, transaction_end)
                        transaction = sql_query1 + sql_query2 + sql_query3
                        log("VCF import : Execute async query (as coroutine)")
                        await Model.execute_aio(transaction)
                        # job_id = Model.execute_bw(transaction, transaction_end)
                        # job_in_progress.append(job_id)
                        # log("VCF import : Execute async query, new job_id : {}. Jobs running [{}]".format(job_id, ','.join([job_in_progress])))
                        # Reset query buffers
                        sql_query1 = ""
                        sql_query2 = ""
                        sql_query3 = ""

        # Loop done, execute last pending query 
        log("VCF import : Execute last async query (as coroutine)")
        transaction = sql_query1 + sql_query2 + sql_query3
        await Model.execute_aio(transaction)
        log("VCF import : Done")


    end = datetime.datetime.now()
    if core is not None:
        core.notify_all({'msg':'import_vcf_end', 'data' : {'file_id' : file_id, 'msg' : 'Import done without error.', 'samples': [ {'id' : samples[s].id, 'name' : samples[s].name} for s in samples.keys()]}})