Ejemplo n.º 1
0
def _annotate_variants(args, conn, metadata, get_val_fn, col_names=None, col_types=None, col_ops=None):
    """Annotate every variant with values drawn from a tabix-indexed file.

    The requested columns are added first, then the ``variants`` table is
    streamed in chunks of CHUNK_SIZE rows. For each row, the annotations
    overlapping it are handed to ``get_val_fn``; any non-empty result is
    queued, and the queue is written with ``_update_variants`` once per
    chunk.

    Selection and identification of values are kept separate from the
    update to avoid concurrent database access errors from sqlite,
    especially on NFS systems.

    Args:
        args: parsed options. ``anno_file`` is read here; ``region_only`` is
            read only when the annotation file is a VCF.
        conn: database session. ``conn.bind`` supplies the connections and
            the URL used to re-open the session.
        metadata: ignored on entry; it is rebound to the metadata returned
            by ``database.get_session_metadata``.
        get_val_fn: callable taking the annotations that overlap one variant
            and returning a list of new column values. An empty list means
            "no hit" and leaves the row untouched.
        col_names: forwarded to ``add_requested_columns`` and
            ``_update_variants``.
        col_types: forwarded to ``add_requested_columns``.
        col_ops: not used by this function; kept for interface
            compatibility.
    """
    # For each variant, use Tabix to detect overlaps with the user-defined
    # annotation file.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    cursor = conn.bind.connect()
    add_requested_columns(args, cursor, col_names, col_types)
    conn.commit()
    cursor.close()

    # Re-open the session and metadata after the schema change
    # (presumably so the metadata includes the columns just added).
    conn, metadata = database.get_session_metadata(str(conn.bind.url))
    cursor = conn.bind.connect()

    total = 0
    CHUNK_SIZE = 100000
    # Loop-invariant: the file extension decides which lookup helper is used.
    is_vcf = args.anno_file.endswith(('.vcf', '.vcf.gz'))

    select_res = cursor.execution_options(stream_results=True).execute('''SELECT chrom, start, end, ref, alt, variant_id FROM variants''')
    while True:
        rows = select_res.fetchmany(CHUNK_SIZE)
        # Stop on the first empty chunk. Comparing the chunk's last
        # variant_id with the previous one would end the scan early (and
        # drop that chunk's pending updates) whenever rows are not returned
        # in ascending id order, which a SELECT without ORDER BY does not
        # promise.
        if not rows:
            break

        to_update = []
        for row in rows:
            # Prefer no pysam parsing (parser=None) over tuple parsing to
            # work around a bug in pysam 0.8.0:
            # https://github.com/pysam-developers/pysam/pull/44
            if is_vcf:
                hits = annotations_in_vcf(row, anno, None, naming, args.region_only, True)
            else:
                hits = annotations_in_region(row, anno, None, naming)

            # update_data starts out as the list of values for the new
            # columns of the current row.
            update_data = get_val_fn(hits)
            # were there any hits for this row?
            if len(update_data) > 0:
                # The primary key goes last; it feeds the WHERE clause of
                # the SQL UPDATE statement.
                update_data.append(str(row["variant_id"]))
                to_update.append(tuple(update_data))

        _update_variants(metadata, to_update, col_names, cursor)

        total += len(to_update)
        # Single-argument form prints the same text on Python 2 and 3.
        print("updated %d variants" % total)
Ejemplo n.º 2
0
def _annotate_variants(args, conn, get_val_fn):
    """Populate one new variants column from a tabix-indexed annotation file.

    For every variant, the annotations overlapping it are handed to
    get_val_fn, whose return value becomes that variant's value in the
    column named by args.col_name.

    Every value is computed before the first UPDATE runs. Keeping the read
    phase apart from the write phase avoids concurrent database access
    errors from sqlite3, especially on NFS systems. The pending list holds
    one (id, value) pair per variant; batching could help if memory issues
    emerge.
    """
    tabix = pysam.Tabixfile(args.anno_file)
    contig_naming = guess_contig_naming(tabix)

    # Phase 1: read every variant and work out its new value.
    reader = conn.cursor()
    reader.execute("SELECT chrom, start, end, variant_id FROM variants")
    pending = []
    for record in reader:
        record_id = str(record["variant_id"])
        hits = annotations_in_region(record, tabix, "tuple", contig_naming)
        pending.append((record_id, get_val_fn(hits)))

    # Phase 2: create the column, then write the values back row by row.
    writer = conn.cursor()
    add_requested_column(args.col_name, writer)
    for record_id, new_value in pending:
        # NOTE(review): the value is spliced into the SQL text via str(), so
        # get_val_fn has to return something that is already valid as a SQL
        # literal -- confirm against the callers.
        statement = "".join((
            "UPDATE variants SET ",
            args.col_name,
            " = ",
            str(new_value),
            " WHERE variant_id = ",
            record_id,
        ))
        writer.execute(statement)
Ejemplo n.º 3
0
def _annotate_variants(args, conn, metadata, get_val_fn, col_names=None, col_types=None, col_ops=None):
    """Annotate every variant with values drawn from a tabix-indexed file.

    The requested columns are added first, then the ``variants`` table is
    streamed in chunks of CHUNK_SIZE rows. For each row, the annotations
    overlapping it are handed to ``get_val_fn``; any non-empty result is
    queued, and the queue is written with ``_update_variants`` once per
    chunk.

    Selection and identification of values are kept separate from the
    update to avoid concurrent database access errors from sqlite,
    especially on NFS systems.

    Args:
        args: parsed options. ``anno_file`` is read here; ``region_only`` is
            read only when the annotation file is a VCF.
        conn: database session. ``conn.bind`` supplies the connections and
            the URL used to re-open the session.
        metadata: ignored on entry; it is rebound to the metadata returned
            by ``database.get_session_metadata``.
        get_val_fn: callable taking the annotations that overlap one variant
            and returning a list of new column values. An empty list means
            "no hit" and leaves the row untouched.
        col_names: forwarded to ``add_requested_columns`` and
            ``_update_variants``.
        col_types: forwarded to ``add_requested_columns``.
        col_ops: not used by this function; kept for interface
            compatibility.
    """
    # For each variant, use Tabix to detect overlaps with the user-defined
    # annotation file.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    cursor = conn.bind.connect()
    add_requested_columns(args, cursor, col_names, col_types)
    conn.commit()
    cursor.close()

    # Re-open the session and metadata after the schema change
    # (presumably so the metadata includes the columns just added).
    conn, metadata = database.get_session_metadata(str(conn.bind.url))
    cursor = conn.bind.connect()

    total = 0
    CHUNK_SIZE = 100000
    # Loop-invariant: the file extension decides which lookup helper is used.
    is_vcf = args.anno_file.endswith(('.vcf', '.vcf.gz'))

    select_res = cursor.execution_options(stream_results=True).execute('''SELECT chrom, start, end, ref, alt, variant_id FROM variants''')
    while True:
        rows = select_res.fetchmany(CHUNK_SIZE)
        # Stop on the first empty chunk. Comparing the chunk's last
        # variant_id with the previous one would end the scan early (and
        # drop that chunk's pending updates) whenever rows are not returned
        # in ascending id order, which a SELECT without ORDER BY does not
        # promise.
        if not rows:
            break

        to_update = []
        for row in rows:
            # Prefer no pysam parsing (parser=None) over tuple parsing to
            # work around a bug in pysam 0.8.0:
            # https://github.com/pysam-developers/pysam/pull/44
            if is_vcf:
                hits = annotations_in_vcf(row, anno, None, naming, args.region_only, True)
            else:
                hits = annotations_in_region(row, anno, None, naming)

            # update_data starts out as the list of values for the new
            # columns of the current row.
            update_data = get_val_fn(hits)
            # were there any hits for this row?
            if len(update_data) > 0:
                # The primary key goes last; it feeds the WHERE clause of
                # the SQL UPDATE statement.
                update_data.append(str(row["variant_id"]))
                to_update.append(tuple(update_data))

        _update_variants(metadata, to_update, col_names, cursor)

        total += len(to_update)
        # Single-argument form prints the same text on Python 2 and 3.
        print("updated %d variants" % total)
Ejemplo n.º 4
0
def _annotate_variants(args,
                       conn,
                       get_val_fn,
                       col_names=None,
                       col_types=None,
                       col_ops=None):
    """Annotate every variant with values drawn from a tabix-indexed file.

    The requested columns are added first, then the ``variants`` table is
    read in chunks of CHUNK_SIZE rows. For each row, the annotations
    overlapping it are handed to ``get_val_fn``; any non-empty result is
    queued, and the queue is written with ``_update_variants`` inside one
    explicit transaction per chunk.

    Selection and identification of values are kept separate from the
    update to avoid concurrent database access errors from sqlite3,
    especially on NFS systems.

    Args:
        args: parsed options; ``anno_file`` is read here and the object is
            also forwarded to ``add_requested_columns``.
        conn: database connection providing ``cursor()``.
        get_val_fn: callable taking the annotations that overlap one variant
            and returning a list of new column values. An empty list means
            "no hit" and leaves the row untouched.
        col_names: forwarded to ``add_requested_columns`` and
            ``_update_variants``.
        col_types: forwarded to ``add_requested_columns``.
        col_ops: not used by this function; kept for interface
            compatibility.
    """
    # For each variant, use Tabix to detect overlaps with the user-defined
    # annotation file.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    select_cursor = conn.cursor()
    update_cursor = conn.cursor()
    add_requested_columns(args, select_cursor, col_names, col_types)

    total = 0
    CHUNK_SIZE = 100000

    select_cursor.execute(
        '''SELECT chrom, start, end, variant_id FROM variants''')
    while True:
        rows = select_cursor.fetchmany(CHUNK_SIZE)
        # Stop on the first empty chunk. Comparing the chunk's last
        # variant_id with the previous one would end the scan early (and
        # drop that chunk's pending updates) whenever rows are not returned
        # in ascending id order, which a SELECT without ORDER BY does not
        # promise.
        if not rows:
            break

        to_update = []
        for row in rows:
            # update_data starts out as the list of values for the new
            # columns of the current row.
            update_data = get_val_fn(
                annotations_in_region(row, anno, "tuple", naming))
            # were there any hits for this row?
            if len(update_data) > 0:
                # The primary key goes last; it feeds the WHERE clause of
                # the SQL UPDATE statement.
                update_data.append(str(row["variant_id"]))
                to_update.append(tuple(update_data))

        # Write the whole chunk inside a single explicit transaction.
        update_cursor.execute("BEGIN TRANSACTION")
        _update_variants(to_update, col_names, update_cursor)
        update_cursor.execute("END TRANSACTION")

        total += len(to_update)
        # Single-argument form prints the same text on Python 2 and 3.
        print("updated %d variants" % total)
Ejemplo n.º 5
0
def _annotate_variants(args, conn, get_val_fn, col_names=None, col_types=None, col_ops=None):
    """Annotate every variant with values drawn from a tabix-indexed file.

    The requested columns are added first, then the ``variants`` table is
    read in chunks of CHUNK_SIZE rows. For each row, the annotations
    overlapping it are handed to ``get_val_fn``; any non-empty result is
    queued, and the queue is written with ``_update_variants`` inside one
    explicit transaction per chunk.

    Selection and identification of values are kept separate from the
    update to avoid concurrent database access errors from sqlite3,
    especially on NFS systems.

    Args:
        args: parsed options; ``anno_file`` is read here and the object is
            also forwarded to ``add_requested_columns``.
        conn: database connection providing ``cursor()``.
        get_val_fn: callable taking the annotations that overlap one variant
            and returning a list of new column values. An empty list means
            "no hit" and leaves the row untouched.
        col_names: forwarded to ``add_requested_columns`` and
            ``_update_variants``.
        col_types: forwarded to ``add_requested_columns``.
        col_ops: not used by this function; kept for interface
            compatibility.
    """
    # For each variant, use Tabix to detect overlaps with the user-defined
    # annotation file.
    anno = pysam.Tabixfile(args.anno_file)
    naming = guess_contig_naming(anno)
    select_cursor = conn.cursor()
    update_cursor = conn.cursor()
    add_requested_columns(args, select_cursor, col_names, col_types)

    total = 0
    CHUNK_SIZE = 100000

    select_cursor.execute('''SELECT chrom, start, end, variant_id FROM variants''')
    while True:
        rows = select_cursor.fetchmany(CHUNK_SIZE)
        # Stop on the first empty chunk. Comparing the chunk's last
        # variant_id with the previous one would end the scan early (and
        # drop that chunk's pending updates) whenever rows are not returned
        # in ascending id order, which a SELECT without ORDER BY does not
        # promise.
        if not rows:
            break

        to_update = []
        for row in rows:
            # update_data starts out as the list of values for the new
            # columns of the current row.
            update_data = get_val_fn(annotations_in_region(row,
                                                           anno,
                                                           "tuple",
                                                           naming))
            # were there any hits for this row?
            if len(update_data) > 0:
                # The primary key goes last; it feeds the WHERE clause of
                # the SQL UPDATE statement.
                update_data.append(str(row["variant_id"]))
                to_update.append(tuple(update_data))

        # Write the whole chunk inside a single explicit transaction.
        update_cursor.execute("BEGIN TRANSACTION")
        _update_variants(to_update, col_names, update_cursor)
        update_cursor.execute("END TRANSACTION")

        total += len(to_update)
        # Single-argument form prints the same text on Python 2 and 3.
        print("updated %d variants" % total)