Beispiel #1
0
def run_age_single(intervals_bed=None, region_list=[], contig_dict={}, reference=None, assembly=None, pad=AGE_PAD,
                   age=None, truncation_pad_read_age=AGE_TRUNCATION_PAD,
                   max_interval_len_truncation_age=AGE_MAX_INTERVAL_TRUNCATION,
                   dist_to_expected_bp=AGE_DIST_TO_BP, min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH,
                   min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH, age_window=AGE_WINDOW_SIZE,
                   age_workdir=None, timeout=AGE_TIMEOUT, keep_temp=False, myid=0):
    """Align the assembled contigs of each region against the reference with AGE and save the breakpoints.

    For every region in ``region_list`` that has contigs in ``contig_dict``, the padded
    reference sequence is written to ``age_workdir``, the external AGE command is run once
    per contig, the parsed records are de-duplicated and handed to ``process_age_records``,
    and the resulting fields are appended to the matching interval from ``intervals_bed``.

    Args:
        intervals_bed: BED input handed to ``pybedtools.BedTool``; searched for the interval
            with the same chrom/start/end as each region.
        region_list: iterable of regions; each is indexed ``[0]..[3]`` and used as a key into
            ``contig_dict``. Only read, never mutated.
        contig_dict: mapping region -> list of contigs (objects exposing ``raw_name``,
            ``sequence_len`` and ``sv_type``). Only read, never mutated.
        reference: path of the reference FASTA (opened with ``pysam.Fastafile``).
        assembly: path of the assembly FASTA, or a falsy value to open none.
        pad: number of bases fetched on each side of the region.
        age: first token of the AGE command line (presumably the executable path).
        truncation_pad_read_age, dist_to_expected_bp: together with ``pad`` they define how
            much of each reference end is kept when the reference is truncated.
        max_interval_len_truncation_age: regions longer than this get a truncated reference
            for INV/DEL/DUP contigs.
        min_del_subalign_len, min_inv_subalign_len, age_window: forwarded unchanged to
            ``process_age_records``.
        age_workdir: directory receiving all temporary files and the output BED.
        timeout: timeout forwarded to ``TimedExternalCmd.run``.
        keep_temp: when false, the FASTA and ``.age.err`` temporaries are removed.
        myid: integer used to name the output file ``<myid>_breakpoints.bed``.

    Returns:
        Path of the written BED file, or ``None`` when no interval was produced.

    Raises:
        Exception: any error raised while processing is logged, printed and re-raised with
            its original traceback; the FASTA handles are closed in every case.
    """
    thread_logger = logging.getLogger("%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly) if assembly else None
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1], region[3])
            matching_intervals = [interval for interval in intervals_bedtool if (
                interval.start == bedtools_interval.start and interval.end == bedtools_interval.end and interval.chrom == bedtools_interval.chrom)]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" % (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" % (str(matching_interval)))

            # The interval name may carry base64-encoded JSON with soft-clip locations.
            # Decoding is best effort: any malformed or missing payload means "no locations".
            # A list comprehension keeps the conversion eager so failures are caught here.
            try:
                sc_locations = [int(location) for location in json.loads(
                    base64.b64decode(matching_interval.name.split(",")[0]))["SC_LOCATIONS"].split(",")]
            except Exception:
                sc_locations = []

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2], region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error("Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(reference=region_object.chrom1, start=region_object.pos1 - pad,
                                                       end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1, region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" % region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name, reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" % (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembeled sequence %s of length %s" % (contig.raw_name, contig.sequence_len))

                # tr_region stays empty unless the reference is truncated for this contig.
                tr_region = []
                if region_object.length() > max_interval_len_truncation_age and contig.sv_type in ["INV", "DEL", "DUP"]:
                    # For large SVs the middle of the reference has no effect on genotyping,
                    # so it is cut out to speed up the alignment.
                    thread_logger.info("Truncate the reference sequence.")

                    truncate_start = pad + dist_to_expected_bp + truncation_pad_read_age + 1
                    truncate_end = len(reference_sequence) - (pad + dist_to_expected_bp + truncation_pad_read_age)
                    reference_sequence_tr = reference_sequence[0:truncate_start - 1] + reference_sequence[truncate_end:]
                    region_name_tr = "%s.%d.%d.tr_%d_%d" % (region_object.chrom1, region_object.pos1, region_object.pos2, truncate_start, truncate_end)
                    ref_name_tr = os.path.join(age_workdir, "%s.ref.fa" % region_name_tr)

                    thread_logger.info("Writing the truncated ref sequence for region %s, contig %s" % (region_name_tr, contig.raw_name))
                    with open(ref_name_tr, "w") as file_handle:
                        file_handle.write(">{}.ref\n{}".format(region_name_tr, reference_sequence_tr))

                    ref_len = len(reference_sequence_tr)
                    ref_f_name = ref_name_tr
                    tr_region = [truncate_start, truncate_end - truncate_start + 1]
                else:
                    ref_len = region_object.length()
                    ref_f_name = ref_name

                if contig.sequence_len * ref_len >= 100000000:
                    thread_logger.info("Skipping contig because AGE problem is large (contig_len = %d , ref_len= %d)"%(contig.sequence_len, ref_len))
                    # The truncated reference was already written; do not leave it behind.
                    if tr_region and not keep_temp:
                        os.remove(ref_name_tr)
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(region_name, contig_sequence))

                age_cmd = "%s %s -both -go=-6 %s %s" % (
                    age, "-inv" if contig.sv_type == "INV" else "-tdup" if contig.sv_type == "DUP" else "-indel",
                    ref_f_name, asm_name)
                cmd_runner = TimedExternalCmd(age_cmd, thread_logger)
                # Context managers guarantee both log files are closed even if the runner raises.
                with open(out, "w") as fd_out, open(err, "w") as fd_err:
                    retcode = cmd_runner.run(timeout=timeout, cmd_log_fd_out=fd_out, cmd_log_fd_err=fd_err)

                if retcode == 0:
                    age_record = AgeRecord(out, tr_region_1=tr_region)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_record.set_assembly_contig(contig_sequence)
                        age_records.append(age_record)
                    else:
                        thread_logger.error("Number of inputs != 2 in age output file %s. Skipping." % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)
                    if tr_region:
                        os.remove(ref_name_tr)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" % (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(set([age_record.contig.sv_type for age_record in unique_age_records]))
            if len(sv_types) != 1:
                thread_logger.error("Some problem. Mixed SV types for this interval %s" % (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(unique_age_records, sv_type=sv_type,
                                                             pad=pad, dist_to_expected_bp=dist_to_expected_bp,
                                                             min_del_subalign_len=min_del_subalign_len,
                                                             min_inv_subalign_len=min_inv_subalign_len,
                                                             age_window=age_window, sc_locations=sc_locations)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [breakpoints[0][0], breakpoints[0][0] + 1, breakpoints[0][1], breakpoints[0][2]])
                elif len(breakpoints) == 2 and (sv_type in ["DEL", "INV", "DUP"]):
                    bedtools_fields += map(str, breakpoints + [breakpoints[1] - breakpoints[0]] + ["."])
                else:
                    # No usable breakpoints: fall back to the interval's own coordinates.
                    bedtools_fields += map(str, [bedtools_fields[1], bedtools_fields[2], -1, "."])
                bedtools_fields[3] += ";AS"
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" % (str(bedtools_fields)))
                bedtools_intervals.append(pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback intact.
        raise
    finally:
        # Close the FASTA handles on both the success and the failure path.
        if assembly_fasta:
            assembly_fasta.close()
        reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir, "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
Beispiel #2
0
def run_age_single(intervals_bed=None, region_list=[], contig_dict={}, reference=None, assembly=None, pad=AGE_PAD,
                   age=None,
                   age_workdir=None, timeout=AGE_TIMEOUT, keep_temp=False, myid=0):
    """Align the assembled contigs of each region against the reference with AGE and save the breakpoints.

    For every region in ``region_list`` that has contigs in ``contig_dict``, the padded
    reference sequence is written to ``age_workdir``, AGE is run once per contig under the
    ``timeout`` command, the parsed records are de-duplicated and handed to
    ``process_age_records``, and the resulting fields are appended to the matching interval
    from ``intervals_bed``.

    Args:
        intervals_bed: BED input handed to ``pybedtools.BedTool``; searched for the interval
            with the same chrom/start/end as each region.
        region_list: iterable of regions; each is indexed ``[0]..[3]`` and used as a key into
            ``contig_dict``. Only read, never mutated.
        contig_dict: mapping region -> list of contigs (objects exposing ``raw_name``,
            ``sequence_len`` and ``sv_type``). Only read, never mutated.
        reference: path of the reference FASTA (opened with ``pysam.Fastafile``).
        assembly: path of the assembly FASTA, or a falsy value to open none.
        pad: number of bases fetched on each side of the region.
        age: first token of the AGE command line (presumably the executable path).
        age_workdir: directory receiving all temporary files and the output BED.
        timeout: number of seconds given to the ``timeout`` command wrapping AGE.
        keep_temp: when false, the FASTA and ``.age.err`` temporaries are removed.
        myid: integer used to name the output file ``<myid>_breakpoints.bed``.

    Returns:
        Path of the written BED file, or ``None`` when no interval was produced.

    Raises:
        Exception: any error raised while processing is logged, printed and re-raised with
            its original traceback; the FASTA handles are closed in every case.
    """
    thread_logger = logging.getLogger("%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly) if assembly else None
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1], region[3])
            matching_intervals = [interval for interval in intervals_bedtool if (
                interval.start == bedtools_interval.start and interval.end == bedtools_interval.end and interval.chrom == bedtools_interval.chrom)]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" % (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" % (str(matching_interval)))

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2], region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error("Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(reference=region_object.chrom1, start=region_object.pos1 - pad,
                                                       end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1, region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" % region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name, reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" % (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembeled sequence %s of length %s" % (contig.raw_name, contig.sequence_len))
                # Skip alignments whose contig x reference size is too large for AGE.
                if contig.sequence_len * region_object.length() >= 100000000:
                    thread_logger.info("Skipping contig because AGE problem is large")
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(region_name, contig_sequence))

                # NOTE(review): this is a shell string with redirections built from file
                # paths; presumably run_cmd executes it through a shell - confirm that the
                # paths are trusted and free of shell metacharacters.
                age_cmd = "%s %s -both -go=-6 %s %s >%s 2>%s" % (
                    age,
                    "-inv" if contig.sv_type == "INV" else "-indel",
                    ref_name,
                    asm_name,
                    out,
                    err)
                execute_cmd = "timeout %ds %s" % (timeout, age_cmd)

                retcode = run_cmd(execute_cmd, thread_logger, None, None)

                if retcode == 0:
                    age_record = AgeRecord(out)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_records.append(age_record)
                    else:
                        thread_logger.error("Number of inputs != 2 in age output file %s. Skipping." % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" % (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(set([age_record.contig.sv_type for age_record in unique_age_records]))
            if len(sv_types) != 1:
                thread_logger.error("Some problem. Mixed SV types for this interval %s" % (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(unique_age_records, sv_type=sv_type, pad=pad)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [breakpoints[0][0], breakpoints[0][0] + 1, breakpoints[0][1]])
                elif len(breakpoints) == 2 and (sv_type == "DEL" or sv_type == "INV"):
                    bedtools_fields += map(str, breakpoints + [breakpoints[1] - breakpoints[0]])
                else:
                    # No usable breakpoints: fall back to the interval's own coordinates.
                    bedtools_fields += map(str, [bedtools_fields[1], bedtools_fields[2], -1])
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" % (str(bedtools_fields)))
                bedtools_intervals.append(pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback intact.
        raise
    finally:
        # Close the FASTA handles on both the success and the failure path.
        if assembly_fasta:
            assembly_fasta.close()
        reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir, "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
Beispiel #3
0
def run_age_single(intervals_bed=None,
                   region_list=[],
                   contig_dict={},
                   reference=None,
                   assembly=None,
                   pad=AGE_PAD,
                   age=None,
                   truncation_pad_read_age=AGE_TRUNCATION_PAD,
                   max_interval_len_truncation_age=AGE_MAX_INTERVAL_TRUNCATION,
                   dist_to_expected_bp=AGE_DIST_TO_BP,
                   min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH,
                   min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH,
                   age_window=AGE_WINDOW_SIZE,
                   age_workdir=None,
                   timeout=AGE_TIMEOUT,
                   keep_temp=False,
                   myid=0):
    """Run AGE for every region's contigs and write the breakpoints to a BED file.

    Each region of ``region_list`` with contigs in ``contig_dict`` gets its padded
    reference sequence written to ``age_workdir``. AGE is then executed once per
    contig, the parsed records are de-duplicated and passed to
    ``process_age_records``, and the resulting fields are appended to the
    interval of ``intervals_bed`` that matches the region.

    Args:
        intervals_bed: BED input given to ``pybedtools.BedTool``; searched for
            the interval sharing chrom/start/end with each region.
        region_list: iterable of regions, each indexed ``[0]..[3]`` and used as
            a key of ``contig_dict``. Read-only.
        contig_dict: mapping region -> list of contigs (objects with
            ``raw_name``, ``sequence_len`` and ``sv_type``). Read-only.
        reference: reference FASTA path (opened with ``pysam.Fastafile``).
        assembly: assembly FASTA path, or a falsy value to open none.
        pad: bases fetched on each side of the region.
        age: first token of the AGE command (presumably the executable path).
        truncation_pad_read_age, dist_to_expected_bp: with ``pad``, define how
            much of each reference end survives truncation.
        max_interval_len_truncation_age: regions longer than this use a
            truncated reference for INV/DEL/DUP contigs.
        min_del_subalign_len, min_inv_subalign_len, age_window: forwarded
            unchanged to ``process_age_records``.
        age_workdir: directory for temporary files and the output BED.
        timeout: timeout forwarded to ``TimedExternalCmd.run``.
        keep_temp: when false, FASTA and ``.age.err`` temporaries are removed.
        myid: integer naming the output file ``<myid>_breakpoints.bed``.

    Returns:
        Path of the written BED file, or ``None`` if no interval was produced.

    Raises:
        Exception: any processing error is logged, printed and re-raised with
            its original traceback; FASTA handles are closed in every case.
    """
    thread_logger = logging.getLogger(
        "%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly) if assembly else None
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1],
                                                    region[3])
            matching_intervals = [
                interval for interval in intervals_bedtool
                if (interval.start == bedtools_interval.start
                    and interval.end == bedtools_interval.end
                    and interval.chrom == bedtools_interval.chrom)
            ]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" %
                                   (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" %
                               (str(matching_interval)))

            # The interval name may carry base64-encoded JSON with soft-clip
            # locations. Decoding is best effort: a malformed or missing
            # payload simply means "no locations". The list comprehension
            # keeps the conversion eager so failures are caught right here.
            try:
                encoded_info = matching_interval.name.split(",")[0]
                decoded_info = json.loads(base64.b64decode(encoded_info))
                sc_locations = [
                    int(location)
                    for location in decoded_info["SC_LOCATIONS"].split(",")
                ]
            except Exception:
                sc_locations = []

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2],
                                     region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error(
                    "Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(
                reference=region_object.chrom1,
                start=region_object.pos1 - pad,
                end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1,
                                        region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" %
                               region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name,
                                                       reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" %
                               (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembeled sequence %s of length %s" %
                    (contig.raw_name, contig.sequence_len))

                # Stays empty unless the reference is truncated for this contig.
                tr_region = []
                if (region_object.length() > max_interval_len_truncation_age
                        and contig.sv_type in ["INV", "DEL", "DUP"]):
                    # For large SVs the middle of the reference has no effect
                    # on genotyping, so it is cut out to speed up alignment.
                    thread_logger.info("Truncate the reference sequence.")

                    kept_flank = (pad + dist_to_expected_bp +
                                  truncation_pad_read_age)
                    truncate_start = kept_flank + 1
                    truncate_end = len(reference_sequence) - kept_flank
                    reference_sequence_tr = (
                        reference_sequence[0:truncate_start - 1] +
                        reference_sequence[truncate_end:])
                    region_name_tr = "%s.%d.%d.tr_%d_%d" % (
                        region_object.chrom1, region_object.pos1,
                        region_object.pos2, truncate_start, truncate_end)
                    ref_name_tr = os.path.join(age_workdir,
                                               "%s.ref.fa" % region_name_tr)

                    thread_logger.info(
                        "Writing the truncated ref sequence for region %s, contig %s"
                        % (region_name_tr, contig.raw_name))
                    with open(ref_name_tr, "w") as file_handle:
                        file_handle.write(">{}.ref\n{}".format(
                            region_name_tr, reference_sequence_tr))

                    ref_len = len(reference_sequence_tr)
                    ref_f_name = ref_name_tr
                    tr_region = [
                        truncate_start, truncate_end - truncate_start + 1
                    ]
                else:
                    ref_len = region_object.length()
                    ref_f_name = ref_name

                if contig.sequence_len * ref_len >= 100000000:
                    thread_logger.info(
                        "Skipping contig because AGE problem is large (contig_len = %d , ref_len= %d)"
                        % (contig.sequence_len, ref_len))
                    # The truncated reference was already written; do not
                    # leave it behind when temporaries are not kept.
                    if tr_region and not keep_temp:
                        os.remove(ref_name_tr)
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(
                        region_name, contig_sequence))

                age_cmd = "%s %s -both -go=-6 %s %s" % (
                    age, "-inv" if contig.sv_type == "INV" else
                    "-tdup" if contig.sv_type == "DUP" else "-indel",
                    ref_f_name, asm_name)
                cmd_runner = TimedExternalCmd(age_cmd, thread_logger)
                # Context managers close both log files even if the runner
                # raises.
                with open(out, "w") as fd_out, open(err, "w") as fd_err:
                    retcode = cmd_runner.run(timeout=timeout,
                                             cmd_log_fd_out=fd_out,
                                             cmd_log_fd_err=fd_err)

                if retcode == 0:
                    age_record = AgeRecord(out, tr_region_1=tr_region)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_record.set_assembly_contig(contig_sequence)
                        age_records.append(age_record)
                    else:
                        thread_logger.error(
                            "Number of inputs != 2 in age output file %s. Skipping."
                            % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)
                    if tr_region:
                        os.remove(ref_name_tr)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" %
                               (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(
                set([
                    age_record.contig.sv_type
                    for age_record in unique_age_records
                ]))
            if len(sv_types) != 1:
                thread_logger.error(
                    "Some problem. Mixed SV types for this interval %s" %
                    (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(
                    unique_age_records,
                    sv_type=sv_type,
                    pad=pad,
                    dist_to_expected_bp=dist_to_expected_bp,
                    min_del_subalign_len=min_del_subalign_len,
                    min_inv_subalign_len=min_inv_subalign_len,
                    age_window=age_window,
                    sc_locations=sc_locations)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [
                        breakpoints[0][0], breakpoints[0][0] + 1,
                        breakpoints[0][1], breakpoints[0][2]
                    ])
                elif len(breakpoints) == 2 and (sv_type
                                                in ["DEL", "INV", "DUP"]):
                    bedtools_fields += map(
                        str, breakpoints + [breakpoints[1] - breakpoints[0]] +
                        ["."])
                else:
                    # No usable breakpoints: fall back to the interval's own
                    # coordinates.
                    bedtools_fields += map(
                        str, [bedtools_fields[1], bedtools_fields[2], -1, "."])
                bedtools_fields[3] += ";AS"
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" %
                                   (str(bedtools_fields)))
                bedtools_intervals.append(
                    pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback intact.
        raise
    finally:
        # Close the FASTA handles on both the success and the failure path.
        if assembly_fasta:
            assembly_fasta.close()
        reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir,
                                       "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed
def run_age_single(intervals_bed=None,
                   region_list=[],
                   contig_dict={},
                   reference=None,
                   assembly=None,
                   pad=AGE_PAD,
                   age=None,
                   age_workdir=None,
                   timeout=AGE_TIMEOUT,
                   keep_temp=False,
                   myid=0):
    """Run AGE for every region's contigs and write the breakpoints to a BED file.

    Each region of ``region_list`` with contigs in ``contig_dict`` gets its padded
    reference sequence written to ``age_workdir``. AGE is then executed once per
    contig under the ``timeout`` command, the parsed records are de-duplicated
    and passed to ``process_age_records``, and the resulting fields are appended
    to the interval of ``intervals_bed`` that matches the region.

    Args:
        intervals_bed: BED input given to ``pybedtools.BedTool``; searched for
            the interval sharing chrom/start/end with each region.
        region_list: iterable of regions, each indexed ``[0]..[3]`` and used as
            a key of ``contig_dict``. Read-only.
        contig_dict: mapping region -> list of contigs (objects with
            ``raw_name``, ``sequence_len`` and ``sv_type``). Read-only.
        reference: reference FASTA path (opened with ``pysam.Fastafile``).
        assembly: assembly FASTA path (opened with ``pysam.Fastafile``).
        pad: bases fetched on each side of the region.
        age: first token of the AGE command (presumably the executable path).
        age_workdir: directory for temporary files and the output BED.
        timeout: seconds given to the ``timeout`` command wrapping AGE.
        keep_temp: when false, FASTA and ``.age.err`` temporaries are removed.
        myid: integer naming the output file ``<myid>_breakpoints.bed``.

    Returns:
        Path of the written BED file, or ``None`` if no interval was produced.

    Raises:
        Exception: any processing error is logged, printed and re-raised with
            its original traceback; FASTA handles are closed in every case.
    """
    thread_logger = logging.getLogger(
        "%s-%s" % (run_age_single.__name__, multiprocessing.current_process()))

    bedtools_intervals = []
    intervals_bedtool = pybedtools.BedTool(intervals_bed)

    assembly_fasta = pysam.Fastafile(assembly)
    reference_fasta = pysam.Fastafile(reference)

    breakpoints_bed = None

    thread_logger.info("Will process %d intervals" % (len(region_list)))

    try:
        for region in region_list:
            bedtools_interval = pybedtools.Interval(region[0], region[1],
                                                    region[3])
            matching_intervals = [
                interval for interval in intervals_bedtool
                if (interval.start == bedtools_interval.start
                    and interval.end == bedtools_interval.end
                    and interval.chrom == bedtools_interval.chrom)
            ]
            if not matching_intervals:
                thread_logger.info("Matching interval not found for %s" %
                                   (str(bedtools_interval)))
                matching_interval = bedtools_interval
            else:
                matching_interval = matching_intervals[0]
            thread_logger.info("Matching interval %s" %
                               (str(matching_interval)))

            if region not in contig_dict:
                continue
            if not contig_dict[region]:
                continue

            region_object = SVRegion(region[0], region[1], region[2],
                                     region[3])
            if region_object.pos1 - pad < 0:
                thread_logger.error(
                    "Region too close to start of chromosome. Skipping.")
                continue

            reference_sequence = reference_fasta.fetch(
                reference=region_object.chrom1,
                start=region_object.pos1 - pad,
                end=region_object.pos2 + pad)
            region_name = "%s.%d.%d" % (region_object.chrom1,
                                        region_object.pos1, region_object.pos2)
            ref_name = os.path.join(age_workdir, "%s.ref.fa" % region_name)

            thread_logger.info("Writing the ref sequence for region %s" %
                               region_name)
            with open(ref_name, "w") as file_handle:
                file_handle.write(">{}.ref\n{}".format(region_name,
                                                       reference_sequence))

            age_records = []
            thread_logger.info("Processing %d contigs for region %s" %
                               (len(contig_dict[region]), str(region_object)))
            for contig in contig_dict[region]:
                thread_logger.info(
                    "Writing the assembeled sequence %s of length %s" %
                    (contig.raw_name, contig.sequence_len))
                # Skip alignments whose contig x reference size is too large
                # for AGE.
                if contig.sequence_len * region_object.length() >= 100000000:
                    thread_logger.info(
                        "Skipping contig because AGE problem is large")
                    continue

                contig_sequence = assembly_fasta.fetch(contig.raw_name)

                prefix = get_age_file_prefix(contig)
                asm_name = os.path.join(age_workdir, "%s.as.fa" % prefix)
                out = os.path.join(age_workdir, "%s.age.out" % prefix)
                err = os.path.join(age_workdir, "%s.age.err" % prefix)

                with open(asm_name, "w") as file_handle:
                    file_handle.write(">{}.as\n{}".format(
                        region_name, contig_sequence))

                # NOTE(review): this is a shell string with redirections built
                # from file paths; presumably run_cmd executes it through a
                # shell - confirm the paths are trusted and free of shell
                # metacharacters.
                age_cmd = "%s %s -both -go=-6 %s %s >%s 2>%s" % (
                    age, "-inv" if contig.sv_type == "INV" else "-indel",
                    ref_name, asm_name, out, err)
                execute_cmd = "timeout %ds %s" % (timeout, age_cmd)

                retcode = run_cmd(execute_cmd, thread_logger, None, None)

                if retcode == 0:
                    age_record = AgeRecord(out)
                    if len(age_record.inputs) == 2:
                        age_record.contig = contig
                        age_records.append(age_record)
                    else:
                        thread_logger.error(
                            "Number of inputs != 2 in age output file %s. Skipping."
                            % out)

                if not keep_temp:
                    os.remove(asm_name)
                    os.remove(err)

            unique_age_records = get_unique_age_records(age_records)

            thread_logger.info("Unique %d AGE records for region %s" %
                               (len(unique_age_records), str(region_object)))
            for age_record in unique_age_records:
                thread_logger.info(str(age_record))

            sv_types = list(
                set([
                    age_record.contig.sv_type
                    for age_record in unique_age_records
                ]))
            if len(sv_types) != 1:
                thread_logger.error(
                    "Some problem. Mixed SV types for this interval %s" %
                    (str(sv_types)))
            else:
                sv_type = sv_types[0]
                thread_logger.info("Processing region of type %s" % sv_type)
                breakpoints, info_dict = process_age_records(
                    unique_age_records, sv_type=sv_type, pad=pad)
                bedtools_fields = matching_interval.fields
                if len(breakpoints) == 1 and sv_type == "INS":
                    bedtools_fields += map(str, [
                        breakpoints[0][0], breakpoints[0][0] + 1,
                        breakpoints[0][1]
                    ])
                elif len(breakpoints) == 2 and (sv_type == "DEL"
                                                or sv_type == "INV"):
                    bedtools_fields += map(
                        str, breakpoints + [breakpoints[1] - breakpoints[0]])
                else:
                    # No usable breakpoints: fall back to the interval's own
                    # coordinates.
                    bedtools_fields += map(
                        str, [bedtools_fields[1], bedtools_fields[2], -1])
                bedtools_fields.append(base64.b64encode(json.dumps(info_dict)))
                thread_logger.info("Writing out fields %s" %
                                   (str(bedtools_fields)))
                bedtools_intervals.append(
                    pybedtools.create_interval_from_list(bedtools_fields))

            if not keep_temp:
                os.remove(ref_name)
    except Exception:
        thread_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Bare raise keeps the original traceback intact.
        raise
    finally:
        # Close the FASTA handles on both the success and the failure path.
        assembly_fasta.close()
        reference_fasta.close()

    thread_logger.info("Writing %d intervals" % (len(bedtools_intervals)))
    if bedtools_intervals:
        breakpoints_bed = os.path.join(age_workdir,
                                       "%d_breakpoints.bed" % myid)
        pybedtools.BedTool(bedtools_intervals).saveas(breakpoints_bed)

    return breakpoints_bed