Example 1
 def check_and_write_data(self, current_lo, bed_file):
     if self.gene is None:
         return self.trans[0].check_and_write_data(current_lo, bed_file)
     # just write the gene if we're a lone gene and it's passed
     elif len(self.trans) == 0 and current_lo > self.gene['hi']:
         output_row = default_bed_line[:]
         output_row[bed_col['chr']] = self.gene['chr']
         output_row[bed_col['lo']] = self.gene['lo']
         output_row[bed_col['hi']] = self.gene['hi']
         output_row[bed_col['name']] = self.gene['name']
         output_row[bed_col['strand']] = self.gene['strand']
         if "thick_start" in self.gene:
             if self.gene['thick_start'] != dxpy.NULL:
                 output_row[bed_col['thick_start']] = str(
                     self.gene['thick_start'])
         if "thick_end" in self.gene:
             if self.gene['thick_end'] != dxpy.NULL:
                 output_row[bed_col['thick_end']] = str(
                     self.gene['thick_end'])
         if "score" in self.gene:
             if self.gene['score'] != dxpy.NULL:
                 output_row[bed_col['score']] = str(self.gene['score'])
         # write the assembled row (the write call is assumed; the excerpt omits it)
         bed_file.write("\t".join(map(str, output_row)) + "\n")
         return True
     elif current_lo > self.gene['hi']:
         for t in self.trans:
             if not t.check_and_write_data(current_lo, bed_file):
                 raise dxpy.AppError(
                     "found end of gene but not end of transcript: " +
                     str(self.gene))
         return True
     else:
         return False
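The bed_col and default_bed_line globals used above are defined elsewhere in the app. A hypothetical shape for them, shown only so the example reads standalone, might be:

# Assumed globals (not part of the original excerpt):
bed_col = {"chr": 0, "lo": 1, "hi": 2, "name": 3, "score": 4, "strand": 5,
           "thick_start": 6, "thick_end": 7}
default_bed_line = ["", "0", "0", "", "0", "+", "0", "0"]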
Example 2
def import_BED(**args):
    if len(args) == 0:
        cmd_line_args = parser.parse_args(sys.argv[1:])
        args['filename'] = cmd_line_args.filename
        args['reference'] = cmd_line_args.reference
        args['file_id'] = cmd_line_args.file_id
        args['additional_type'] = cmd_line_args.additional_type
        args['property_key'] = cmd_line_args.property_key
        args['property_value'] = cmd_line_args.property_value
        args['tag'] = cmd_line_args.tag

    bed_filename = args['filename']
    reference = args['reference']
    file_id = args['file_id']
    additional_types = args['additional_type']
    property_keys = args['property_key']
    property_values = args['property_value']
    tags = args['tag']

    job_outputs = []
    # uncompresses file if necessary.  Returns new filename
    bed_filename_uncomp = unpack(bed_filename)

    current_file = 1

    for import_filename in split_on_track(bed_filename_uncomp):
        try:
            bed_basename = os.path.basename(bed_filename)
        except Exception:
            bed_basename = bed_filename
        if current_file == 1:
            name = bed_basename
        else:
            name = bed_basename+"_"+str(current_file)
        current_file += 1
        file_type = detect_type(import_filename)
        bed_type = file_type["type"]
        delimiter = file_type["delimiter"]

        print("Bed type is: " + bed_type, file=sys.stderr)
        if bed_type == "genes":
            print("Importing as Genes Type", file=sys.stderr)
            job_outputs.append(import_genes(import_filename, name, reference, file_id, additional_types, property_keys, property_values, tags, delimiter))
        elif bed_type == "spans" or bed_type == "bedDetail":
            print("Importing as Spans Type", file=sys.stderr)
            if bed_type == "bedDetail":
                print("input file is in 'bedDetails' format...", file=sys.stderr)
                bedDetail=True
            else:
                bedDetail=False
            job_outputs.append(import_spans(import_filename, name, reference, file_id, additional_types, property_keys, property_values, tags, bedDetail, delimiter))
        else:
            raise dxpy.AppError("Unable to determine type of BED file")

        subprocess.check_call(["rm", import_filename])

    if bed_filename != bed_filename_uncomp:
        subprocess.check_call(["rm", bed_filename_uncomp])

    print(json.dumps(job_outputs))
    return job_outputs
Example 3
def RunPindel(kwargs, pindel_command, output_path):
    folder = output_path.split("/")[0]
    print "Making folder for output: " + str(folder)
    os.mkdir(folder)

    print "Running pindel with: "
    print '\t' + str(pindel_command)
    start_time = time.time()
    try:
        p = subprocess.check_output(pindel_command,
                                    stderr=subprocess.STDOUT,
                                    shell=True)
        print p
        tot_time = time.time() - start_time
        hours = int(tot_time / 3600)
        mins = int(float(tot_time % 3600) / 60)
        secs = tot_time % 60
        print "Pindel ran in: {hrs}h {mins}m {secs}s".format(hrs=hours,
                                                             mins=mins,
                                                             secs=secs)
    except subprocess.CalledProcessError as e:
        print "\n" + str(e.output)
        raise dxpy.AppError(
            "Pindel failed to run. Please check job logs for pindel output. "
            "If the error is a segmentation fault raised as pindel begins to run, "
            "check that the reference FASTA file is the same reference used to produce the mappings")
Example 4
def ExportVCF(kwargs, output_path, ref_fn):
    ref_name_version = dxpy.describe(kwargs["reference_fasta"])["name"]
    # rstrip(".fa") strips a character set, not a suffix; trim the extension explicitly
    if ref_name_version.endswith(".fa"):
        ref_name_version = ref_name_version[:-len(".fa")]
    vcf_out_fn = kwargs["output_prefix"] + '.pindel.vcf'

    command_args = ["pindel2vcf"]
    command_args.append("-r {input}".format(input=ref_fn))
    command_args.append("-P {input}".format(input=output_path))
    command_args.append("-v {input}".format(input=vcf_out_fn))
    if kwargs["vcf_gatk_compatible"]:
        command_args.append("-G")

    if "export_vcf_advanced_options" in kwargs:
        command_args.append(kwargs["export_vcf_advanced_options"])
    else:
        ref_date = str(datetime.date.today())
        command_args.append("-R {input}".format(input=ref_name_version))
        command_args.append("-d ''")

    try:
        vcf_command = " ".join(command_args)
        print "Executing: " + vcf_command
        print subprocess.check_output(vcf_command,
                                      stderr=subprocess.STDOUT,
                                      shell=True)
    except subprocess.CalledProcessError as e:
        print e
        print e.output
        raise dxpy.AppError(
            "APP ERROR: App was not able to convert pindel to vcf. Please check pindel2vcf inputs"
        )
Example 5
def group_files_by_read(fastq_files):
    """
    Function : Groups a list of FASTQ files by the values of their Read property that indicates the read number.
                       Returns a dict mapping each observed value of the property (or 'none' if a file does not have a value
                         for the property) to a list of the files with that value. Within each group, the files are sorted by their
                       value of the Chunk property (to ensure that left and right reads of a given chunk are handled together.
    Args     : fastq_files - a list of dxpy.DXFile objects representing FASTQ files.
    Returns  : dict.
    """

    print("Grouping Fastq files by read number")
    read_dict = {}

    for fastq_file in fastq_files:
        props = fastq_file.get_properties()
        read_num = props["read"]
        if read_num not in ["1", "2", "none"]:
            raise dxpy.AppError("%s has invalid Read property: %s" %
                                (fastq_file.get_id(), read_num))
        if read_num not in read_dict:
            read_dict[read_num] = []
        read_dict[read_num].append(fastq_file)

    #for read_num in read_dict:
    #    read_dict[read_num] = sorted(read_dict[read_num], key=chunk_property)

    return read_dict
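A minimal standalone sketch of the same group-by pattern, with plain dicts standing in for dxpy.DXFile objects:

# Standalone sketch (plain dicts stand in for dxpy.DXFile objects):
files = [{"id": "f1", "read": "1"}, {"id": "f2", "read": "2"},
         {"id": "f3", "read": "1"}]
read_dict = {}
for f in files:
    read_dict.setdefault(f["read"], []).append(f["id"])
print(read_dict)  # {'1': ['f1', 'f3'], '2': ['f2']}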
Example 6
def main(**job_inputs):
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref(
            'genome_mmi')
    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        file_ext = re.search(r"(fastq|fasta|fa|fq)(\.gz)?$",
                             one_reads_file,
                             flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Invalid filetype extension supplied.")

    # for fasta and fastq inputs, run jobs using native minimap2
    jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [
        j.get_output_ref('mapped_reads_index') for j in jobs
    ]

    return output
Example 7
def main(tumor_bams=None, normal_bams=None, cn_reference=None,
         baits=None, fasta=None, annotation=None,
         method='hybrid', is_male_normal=True, drop_low_coverage=False,
         antitarget_avg_size=150000, target_avg_size=267, do_parallel=True):

    if not tumor_bams and not normal_bams:
        raise dxpy.AppError("Must provide tumor_bams or normal_bams (or both)")
    if cn_reference and any((normal_bams, baits, fasta, annotation)):
        raise dxpy.AppError("Reference profile (cn_reference) cannot be used "
                            "alongside normal_bams, baits, fasta, "
                            "or annotation")
    if tumor_bams and not any((baits, cn_reference)):
        raise dxpy.AppError("Need cn_reference or baits to process tumor_bams")

    print("Downloading file inputs to the local file system")
    cn_reference = download_link(cn_reference)
    baits = download_link(baits)
    fasta = download_link(fasta)
    annotation = download_link(annotation)
    if tumor_bams is not None:
        tumor_bams = list(map(download_link, tumor_bams))
    if normal_bams is not None:
        normal_bams = list(map(download_link, normal_bams))

    # If these input files are gzipped, decompress them
    fasta = maybe_gunzip(fasta, "ref", "fa")
    annotation = maybe_gunzip(annotation, "annot", "txt")

    out_fnames = run_cnvkit(tumor_bams, normal_bams, cn_reference, baits,
                            fasta, annotation, method, is_male_normal,
                            drop_low_coverage, antitarget_avg_size,
                            target_avg_size, do_parallel)

    print("Uploading local file outputs to the DNAnexus platform")
    output = {}
    for filekey in ("cn_reference", "seg", "metrics", "genders", "scatter_pdf",
                    "diagram_pdf"):
        if filekey in out_fnames:
            output[filekey] = dxpy.dxlink(
                dxpy.upload_local_file(out_fnames[filekey]))
    for listkey in ("copy_ratios", "copy_segments", "gainloss", "breaks"):
        if listkey in out_fnames:
            output[listkey] = [dxpy.dxlink(dxpy.upload_local_file(fname))
                               for fname in out_fnames[listkey]]
    return output
Example 8
def ValidateBamConfig(bam_config_fn, bam_name_array):
    print "\nValidating bam config file"
    with open(bam_config_fn) as config_fh:
        for line in config_fh:
            if not line.strip():
                continue
            name = line.split()[0]
            if name not in bam_name_array:
                raise dxpy.AppError(
                    "Bam config file contains filenames which do not match input bam files"
                )
    print "\tBam config file is valid"
    return True
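A hypothetical smoke test against a throwaway config file (assumes dxpy and the function above are in scope):

with open("bam_config.txt", "w") as fh:
    fh.write("sample1.bam\t250\tsample1\n")
ValidateBamConfig("bam_config.txt", ["sample1.bam", "sample2.bam"])
# prints: Validating bam config file / Bam config file is valid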
Example 9
def main(**job_inputs):
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref(
            'genome_mmi')
    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        file_ext = re.search(r"(bam|fastq|fasta|fa|fq)(\.gz)?$",
                             one_reads_file,
                             flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Unknown filetype extension supplied.")

    if file_ext == 'bam':
        # input bam files must be pacbio raw reads
        if datatype == 'ONT':
            raise dxpy.AppError("Invalid file input for provided datatype.")

        # for bam input, run jobs using pbmm2
        jobs = run_pbmm2_subjobs(job_inputs)

    else:
        # for fasta and fastq inputs, run jobs using native minimap2
        if job_inputs['pbbamify']:
            print(
                'WARNING: The "Run pbbamify" option is only valid for BAM input'
            )
        jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [
        j.get_output_ref('mapped_reads_index') for j in jobs
    ]

    return output
Example 10
def check_reads(reads_tables):
    # validate that tables contain data that can be used together (all paired or all unpaired, etc)

    if len(reads_tables) == 0:
        raise dxpy.AppError("Please enter at least one Reads table as input")

    single = 0
    paired = 0

    for table in reads_tables:
        if 'sequence2' in dxpy.DXGTable(table).get_col_names():
            paired += 1
        else:
            single += 1

    if single > 0 and paired > 0:
        raise dxpy.AppError(
            "Found both single and paired-end reads.  Please only input one type."
        )

    return
Example 11
def main(**kwargs):
    if len(kwargs) == 0:
        kwargs = vars(arg_parser.parse_args(sys.argv[1:]))

    try:
        spans = dxpy.DXGTable(kwargs['Spans'])
    except Exception:
        raise dxpy.AppError("Failed to open Spans object for export")

    spans_types = spans.describe()['types']

    if 'Genes' in spans_types:
        export_genes(spans, kwargs['output'])
    else:
        export_generic_bed(spans, kwargs['output'])
Example 12
def validate_per_tumor(values, n_expected, title, criterion=None):
    """Ensure a per-tumor input array matches the number of tumor BAMs, etc.

    Also allow a value of None to skip checks & downstream processing, or a
    single value to apply to every tumor sample.

    Returns: list of values of length `n_expected`.
    """
    if values is None:
        out_vals = [None] * n_expected
    else:
        if criterion is not None and not all(map(criterion, values)):
            raise dxpy.AppError(
                """Tumor {} must all be between 0 and 1; got: {}""".format(
                    title, values))
        if len(values) == n_expected:
            out_vals = values
        elif len(values) == 1 and n_expected > 1:
            out_vals = values * n_expected  # broadcast the single value
        else:
            raise dxpy.AppError(
                "Number of tumor {} specified ({}) does not match the number "
                "of tumor BAM files given ({})".format(title, len(values),
                                                       n_expected))
    return out_vals
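For example, a single purity value supplied for three tumor BAMs is broadcast to all of them (a hypothetical call, assuming the function above is in scope):

purities = validate_per_tumor([0.6], 3, "purities",
                              criterion=lambda v: 0 <= v <= 1)
print(purities)  # [0.6, 0.6, 0.6]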
Example 13
def find_delimiter(bed_file):
    with open(bed_file, "rU") as bf: 
        line = bf.readline()
        if line.startswith("track"):
            line = bf.readline()
        tab_split = line.split("\t")
        
        if len(tab_split) >= 3: 
            print("Bed file is tab delimited", file=sys.stderr)
            return "\t"
        else: 
            space_split = line.split()
            if len(space_split) < 3: 
                raise dxpy.AppError("File is not a valid bed file (neither space delimited nor tab delimited)")
            print("Bed file is space delimited", file=sys.stderr)
            return " "
Example 14
def convert_qual(qualString, qual_encode):
    convQualString = ''

    if qual_encode == 'phred64':
        # convert to phred33 by subtracting the difference in ASCII offsets (64 - 33 = 31)
        # should values be rescaled here? some top-end values are lost by a plain shift
        convQualString = ''.join(chr(ord(c) - 31) for c in qualString)
    elif qual_encode == 'qual_file':
        convQualString = ''.join(
            chr(int(i) + 33) for i in qualString.strip(' ').split(' '))
    elif qual_encode == 'phred33':
        convQualString = qualString
    else:
        raise dxpy.AppError(
            "Unknown quality encoding.  Supported encodings are Phred33 and Phred64."
        )

    return convQualString
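A runnable sanity check of the offset arithmetic: 'h' encodes Q40 under Phred+64 and 'I' encodes Q40 under Phred+33.

phred64 = "hhhh"
phred33 = ''.join(chr(ord(c) - 31) for c in phred64)
print(phred33)  # -> IIII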
Example 15
def main(**kwargs):
    mappings_ids = kwargs["mappings_files"]
    mappings_names = sorted([dxpy.describe(id)["name"] for id in mappings_ids])

    if "num_threads_per_instance" not in kwargs:
        kwargs["num_threads_per_instance"] = multiprocessing.cpu_count()
    if "num_instances" not in kwargs:
        kwargs["num_instances"] = 1

    # Set output prefix here
    if "output_prefix" not in kwargs:
        kwargs["output_prefix"] = mappings_names[0].rstrip('.bam').rstrip(
            '.txt')
    # Set output suffixes (for consistency through app)
    kwargs["variant_suffixes"] = {
        "deletions": 'D',
        "short_inserts": 'SI',
        "tandem_duplications": 'TD',
        "large_inserts": 'LI',
        "inversions": 'INV',
        "breakpoints": 'BP',
        #"breakdancer_outputs": 'BD',
        "close_mapped_reads": 'CloseEndMapped'
    }
    """
    if kwargs["export_vcf"]:
        print "\nTESTING pindel2vcf command line inputs on dummy inputs"
        ExportVCF(kwargs, output_path="/usr/test_vcf/dummy", ref_fn="/usr/test_vcf/dummy.fa")
    """
    # Check if input files have .bam extension
    if mappings_names[0].endswith(".bam"):
        for name in mappings_names:
            if not name.endswith(".bam"):
                raise dxpy.AppError(
                    "Input mappings files are not all bam files with .bam extensions"
                )
        app_outputs = RunWithBamInput(kwargs=kwargs)
    else:
        app_outputs = RunWithPindelInput(kwargs=kwargs, sam2pindel=False)

    return app_outputs
Example 16
def main(sam_file, probability):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    sam_file = dxpy.DXFile(sam_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(sam_file.get_id(), "sam_file")

    if probability < 0 or probability > 1:
        raise dxpy.AppError(
            "Probability parameter determines % of mappings included in output. Must be between 0 an 1."
        )

    subprocess.check_call(" ".join([
        "java", "-Xmx2g", "-jar", "/usr/local/bin/DownsampleSam.jar",
        "INPUT=sam_file", "OUTPUT=downsampled_sam",
        "PROBABILITY=" + str(probability)
    ]), shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    downsampled_sam = dxpy.upload_local_file("downsampled_sam")
    downsampled_sam.rename(sam_file.describe()['name'] + "_downsampled")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["downsampled_sam"] = dxpy.dxlink(downsampled_sam)

    return output
Example 17
def iterate_reads(fastqa1_filename, fastqa2_filename, qual1_filename,
                  qual2_filename, is_fasta, is_colorspace, qual_encoding):
    fastqa1_iter = unpack_and_open(fastqa1_filename).__iter__()
    fastqa2_iter, qual1_iter, qual2_iter = None, None, None
    if fastqa2_filename != None:
        fastqa2_iter = unpack_and_open(fastqa2_filename).__iter__()
    if qual1_filename != None:
        qual1_iter = unpack_and_open(qual1_filename).__iter__()
    if qual2_filename != None:
        qual2_iter = unpack_and_open(qual2_filename).__iter__()

    read_iter = get_read(fastqa1_iter, qual1_iter, is_fasta, is_colorspace,
                         qual_encoding).__iter__()
    # a second read iterator is only needed when a paired file is given
    if fastqa2_filename is not None:
        read_iter2 = get_read(fastqa2_iter, qual2_iter, is_fasta,
                              is_colorspace, qual_encoding).__iter__()

    try:
        while True:
            name1, seq1, qual1 = read_iter.next()
            name2, seq2, qual2 = None, None, None
            if fastqa2_filename != None:
                name2, seq2, qual2 = read_iter2.next()
            yield name1, seq1, qual1, name2, seq2, qual2
    except StopIteration:
        # check to make sure all files we're reading from are all finished at the same time
        for file_iter in (fastqa1_iter, fastqa2_iter, qual1_iter, qual2_iter):
            if file_iter is not None:
                try:
                    file_iter.next()
                except StopIteration:
                    continue
                raise dxpy.AppError(
                    "Number of reads in each file must be equal")
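The trailing-data check above can be sketched standalone: after the paired iteration stops, pulling one more item from each iterator exposes any leftover reads.

a, b = iter([1, 2, 3, 4]), iter([10, 20])
for pair in zip(a, b):
    pass
leftover = any(next(it, None) is not None for it in (a, b))
print(leftover)  # True -> the two "files" had unequal read counts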
Example 18
def RunWithPindelInput(kwargs, sam2pindel=False):
    pindel_config_fn = "pindel_config.txt"
    mappings_names = DownloadFilesFromArray(input_ids=kwargs["mappings_files"])

    if sam2pindel:
        print "\nInput was not produced by BWA/MOSAIK, running sam2pindel script on input BAM files"
        if "sequence_platform" not in kwargs:
            raise dxpy.AppError(
                "If BAM files were not produced by BWA, must ALSO specify which sequence platform was used to produce the mappings"
            )
        pindel_config_fn = RunSam2Pindel(
            bam_names=mappings_names,
            insert_size=kwargs["insert_size"],
            seq_platform=kwargs["sequence_platform"],
            num_threads=kwargs["num_threads_per_instance"],
            config_fn=pindel_config_fn)
    else:
        print "\nInput is pindel input. Making pindel configuration file"
        pindel_config_fn = WriteConfigFile(mappings_names=mappings_names,
                                           fn=pindel_config_fn,
                                           is_pindel=True)

    chrom = kwargs["chromosome"] if "chromosome" in kwargs else "ALL"
    command, output_path = BuildPindelCommand(kwargs=kwargs,
                                              chrom=chrom,
                                              input_fn=pindel_config_fn,
                                              is_pindel_input_type=True)
    output_path = RunPindel(kwargs=kwargs,
                            pindel_command=command,
                            output_path=output_path)

    app_outputs = UploadPindelOutputs(kwargs=kwargs, output_path=output_path)
    if kwargs["export_vcf"]:
        app_outputs["vcf"] = ExportVCF(kwargs=kwargs,
                                       output_path=output_path,
                                       ref_fn="reference_fasta")

    return app_outputs
Example 19
def map_contaminant(Contig, Reads):
    # get ID of our mapper
    try:
        bwa = dxpy.DXApp(
            dxpy.find_apps(name="bwa_mem_fastq_read_mapper").next()['id'])
    except StopIteration:
        raise dxpy.AppError(
            "Unable to find app 'bwa_mem_fastq_read_mapper'.  Please install it to enable contaminant mapping"
        )

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({
        "reads": Reads,
        "reference": Contig,
        "discard_unmapped_rows": True,
        "chunk_size": 10000000
    })

    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        if 'sequence2' in desc['columns']:
            current_reads *= 2
        total_reads += current_reads

    # launch a job that waits for the mapping and calculates what % has mapped
    calc_job = dxpy.new_dxjob(
        {
            "num_reads": total_reads,
            "mappings": {
                "job": map_job.get_id(),
                "field": "mappings"
            }
        }, "calc_contam")

    return calc_job.get_id()
Example 20
def importGTF(**args):

    if len(args) == 0:
        command_line_args = parser.parse_args(sys.argv[1:])
        fileName = command_line_args.fileName
        reference = command_line_args.reference
        outputName = command_line_args.outputName
        tag = command_line_args.tag
        property_key = command_line_args.property_key
        property_value = command_line_args.property_value
        additional_type = command_line_args.additional_type
        file_id = command_line_args.file_id
    else:
        fileName = args['fileName']
        reference = args['reference']
        outputName = args.get('outputName') or ''
        tag = args.get('tag') or []
        property_key = args.get('property_key') or []
        property_value = args.get('property_value') or []
        additional_type = args.get('additional_type') or []
        file_id = args.get('file_id')

    inputFileName = unpack(fileName)

    capturedTypes = {
        "5UTR": "5' UTR",
        "3UTR": "3' UTR",
        "CDS": "CDS",
        "inter": "intergenic",
        "inter_CNS": "intergenic_conserved",
        "intron_CNS": "intron_conserved",
        "exon": "exon",
        "transcript": "transcript",
        "gene": "gene",
        "stop_codon": "stop_codon",
        "start_codon": "start_codon"
    }

    #Rows of this type will not be written to the gtable as their information is fully encompassed by the rest of the data

    ##Isolate the attribute tags from the file and check integrity
    spansTable, additionalColumns = constructTable(inputFileName)
    spansTable.add_tags(tag)

    types = ["Genes", "gri"]
    for x in additional_type:
        types.append(x)
    spansTable.add_types(types)
    details = {'original_contigset': dxpy.dxlink(reference)}

    if len(property_key) != len(property_value):
        raise dxpy.AppError(
            "Expected each provided property to have a corresponding value")
    for i in range(len(property_key)):
        details[property_key[i]] = property_value[i]

    if file_id != None:
        details['original_file'] = dxpy.dxlink(file_id)
    spansTable.set_details(details)
    if outputName == '':
        spansTable.rename(fileName)
    else:
        spansTable.rename(outputName)

    #This passes through the file calculates the gene and transcript models
    genes = {}
    transcripts = {}
    spanId = 0
    frames = {}
    stopCodons = {}

    inputFile = open(inputFileName, 'r')
    for line in inputFile:
        if line[0] != "#":
            values = parseLine(line, capturedTypes)

            if values["type"] == "CDS":
                if frames.get(values["transcriptId"]) == None:
                    frames[values["transcriptId"]] = {}
                frames[values["transcriptId"]][values["lo"]] = values["frame"]

            for element, hashId, elementType in [
                    [genes, values["geneId"], "geneName"],
                    [transcripts, values["transcriptId"], "transcriptName"]]:

                newEntry = {
                    "lo": values["lo"],
                    "hi": values["hi"],
                    "codingLo": -1,
                    "codingHi": -1,
                    "strand": values["strand"],
                    "score": values["score"],
                    "geneId": values["geneId"],
                    "coding": False,
                    "spanId": spanId,
                    "name": values[elementType],
                    "originalGeneId": values["attributes"]["gene_id"],
                    "originalTranscriptId": values["attributes"]["transcript_id"]
                }

                if element.get(hashId) is None:
                    element[hashId] = {values["chromosome"]: newEntry}
                    spanId += 1
                elif element[hashId].get(values["chromosome"]) is None:
                    element[hashId][values["chromosome"]] = newEntry
                    spanId += 1
                else:
                    chrEntry = element[hashId][values["chromosome"]]
                    if values["lo"] < chrEntry["lo"]:
                        chrEntry["lo"] = values["lo"]
                    if values["hi"] > chrEntry["hi"]:
                        chrEntry["hi"] = values["hi"]

            if values["type"] == "stop_codon":
                if stopCodons.get(values["transcriptId"]) == None:
                    stopCodons[values["transcriptId"]] = [[
                        values["lo"], values["hi"]
                    ]]
                else:
                    stopCodons[values["transcriptId"]].append(
                        [values["lo"], values["hi"]])

            if values["type"] == "CDS" or values[
                    "type"] == "start_codon" or values["type"] == "stop_codon":
                if values["hi"] > transcripts[values["transcriptId"]][
                        values["chromosome"]]["codingHi"]:
                    transcripts[values["transcriptId"]][
                        values["chromosome"]]["codingHi"] = values["hi"]
                if values["lo"] < transcripts[values["transcriptId"]][
                        values["chromosome"]]["codingLo"] or transcripts[
                            values["transcriptId"]][
                                values["chromosome"]]["codingLo"] == -1:
                    transcripts[values["transcriptId"]][
                        values["chromosome"]]["codingLo"] = values["lo"]
                genes[values["geneId"]][values["chromosome"]]["coding"] = True
                transcripts[values["transcriptId"]][
                    values["chromosome"]]["coding"] = True

    for gId, chrList in genes.iteritems():
        for k, v in chrList.iteritems():
            entry = [
                k, v["lo"], v["hi"], v["name"], v["spanId"], "gene",
                v["strand"], v["score"], v["coding"], -1, -1, '', '',
                v["originalGeneId"], ''
            ]
            for x in additionalColumns:
                if x != "gene_id" and x != "transcript_id":
                    entry.append('')
            spansTable.add_rows([entry])
    for gId, chrList in transcripts.iteritems():
        for k, v in chrList.iteritems():
            entry = [
                k, v["lo"], v["hi"], v["name"], v["spanId"], "transcript",
                v["strand"], v["score"], genes[v["geneId"]][k]["coding"],
                genes[v["geneId"]][k]["spanId"], -1, '', '',
                v["originalGeneId"], v["originalTranscriptId"]
            ]
            for x in additionalColumns:
                if x != "gene_id" and x != "transcript_id":
                    entry.append('')
            spansTable.add_rows([entry])

    exons = {}
    inputFile = open(inputFileName, 'r')

    for line in inputFile:
        if line[0] != "#":
            values = parseLine(line, capturedTypes)

            if exons.get(values["transcriptId"]) != None:
                if exons[values["transcriptId"]].get(
                        values["chromosome"]) == None:
                    exons[values["transcriptId"]][values["chromosome"]] = []
            else:
                exons[values["transcriptId"]] = {values["chromosome"]: []}

            if capturedTypes.get(values["type"]) != None:
                #If type is 5'UTR, 3'UTR, intergenic, or conserved intron, type is always noncoding
                if values["type"] == "5UTR" or values[
                        "type"] == "3UTR" or values[
                            "type"] == "inter" or values[
                                "type"] == "inter_CNS" or values[
                                    "type"] == "intron_CNS":
                    writeEntry(
                        spansTable, spanId, exons[values["transcriptId"]],
                        additionalColumns, values["chromosome"], values["lo"],
                        values["hi"], values["attributes"], [
                            values["chromosome"], values["lo"], values["hi"],
                            values["name"], spanId,
                            capturedTypes[values["type"]], values["strand"],
                            values["score"], False,
                            transcripts[values["transcriptId"]]["spanId"],
                            values["frame"], '', values["source"]
                        ])

                if "exon_number" in values["attributes"]:
                    values["transcriptName"] += "." + values["attributes"][
                        "exon_number"]

                #If type is CDS, always of type coding
                if values["type"] == "CDS":
                    if stopCodons.get(values["transcriptId"]) != None:
                        for x in stopCodons[values["transcriptId"]]:
                            if values["hi"] == x[0]:
                                values["hi"] = x[1]
                                break
                    if [values["lo"], values["hi"]] not in exons[
                            values["transcriptId"]][values["chromosome"]]:
                        spanId = writeEntry(
                            spansTable, spanId, exons[values["transcriptId"]],
                            additionalColumns, values["chromosome"],
                            values["lo"], values["hi"], values["attributes"], [
                                values["chromosome"], values["lo"],
                                values["hi"], values["transcriptName"], spanId,
                                capturedTypes[values["type"]],
                                values["strand"], values["score"], True,
                                transcripts[values["transcriptId"]][
                                    values["chromosome"]]["spanId"],
                                values["frame"], '', values["source"]
                            ])

                # stop codons are folded into the exon handling below
                if values["type"] == "stop_codon":
                    values["type"] = "exon"
                    values["frame"] = 3 - (values["hi"] - values["lo"])
                    #if values["strand"] == "-":
                    #    values["lo"] = transcripts[values["transcriptId"]][values["chromosome"]]["lo"]
                    #else:
                    #    values["hi"] = transcripts[values["transcriptId"]][values["chromosome"]]["hi"]

                if values["type"] == "exon":
                    if (transcripts[values["transcriptId"]][
                            values["chromosome"]]["codingLo"] != -1
                            and transcripts[values["transcriptId"]][
                                values["chromosome"]]["codingHi"] != -1):
                        if frames.get(values["transcriptId"]) != None:
                            if frames[values["transcriptId"]].get(
                                    values["lo"]) != None:
                                values["frame"] = frames[
                                    values["transcriptId"]][values["lo"]]

                        for x in splitExons(
                                transcripts[values["transcriptId"]],
                                values["chromosome"], values["lo"],
                                values["hi"], values["strand"]):
                            spanId = writeEntry(
                                spansTable, spanId,
                                exons[values["transcriptId"]],
                                additionalColumns, values["chromosome"], x[1],
                                x[2], values["attributes"], [
                                    values["chromosome"], x[1], x[2],
                                    values["transcriptName"], spanId, x[0],
                                    values["strand"], values["score"], x[3],
                                    transcripts[values["transcriptId"]][
                                        values["chromosome"]]["spanId"],
                                    values["frame"], '', values["source"]
                                ])
                    else:
                        spanId = writeEntry(
                            spansTable, spanId, exons[values["transcriptId"]],
                            additionalColumns, values["chromosome"],
                            values["lo"], values["hi"], values["attributes"], [
                                values["chromosome"], values["lo"],
                                values["hi"], values["transcriptName"], spanId,
                                capturedTypes[values["type"]],
                                values["strand"], values["score"], False,
                                transcripts[values["transcriptId"]][
                                    values["chromosome"]]["spanId"],
                                values["frame"], '', values["source"]
                            ])

    spansTable.flush()
    spansTable.close()
    outputFile = open("result.txt", 'w')
    outputFile.write(spansTable.get_id())
    outputFile.close()
    print(spansTable.get_id())
    return spansTable.get_id()
Example 21
def constructTable(inputFileName):
    inputFile = open(inputFileName, 'r')
    attributes = {"gene_id": True, "transcript_id": True}
    for line in inputFile:
        if line[0] != "#":
            tabSplit = line.split("\t")
            if len(tabSplit) == 1:
                tabSplit = line.split(" ")
                if len(tabSplit) < 9:
                    raise dxpy.AppError(
                        "One row did not have 9 entries, it had 1 instead. Offending line: "
                        + line)
                tabSplit[8] = " ".join(tabSplit[8:])
                tabSplit = tabSplit[:9]

            if len(tabSplit) != 9:
                raise dxpy.AppError("One row did not have 9 entries, it had " +
                                    str(len(tabSplit)) +
                                    " instead. Offending line: " + line)
            else:
                entrySplit = tabSplit[8].split(";")
                geneIdPresent = False
                transcriptIdPresent = False
                result = []
                for x in entrySplit:
                    keyValue = x.strip().split(" ")
                    key = keyValue[0]
                    if key == "gene_id":
                        geneIdPresent = True
                    elif key == "transcript_id":
                        transcriptIdPresent = True
                    attributes[key] = True
            if not geneIdPresent:
                raise dxpy.AppError(
                    "One row did not have a gene_id. Offending line: " + line)
            if not transcriptIdPresent:
                raise dxpy.AppError(
                    "One row did not have a transcript_id. Offending line: " +
                    line)

    #Construct table
    schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "uint32"},
        {"name": "hi", "type": "uint32"},
        {"name": "name", "type": "string"},
        {"name": "span_id", "type": "int32"},
        {"name": "type", "type": "string"},
        {"name": "strand", "type": "string"},
        {"name": "score", "type": "float"},
        {"name": "is_coding", "type": "boolean"},
        {"name": "parent_id", "type": "int32"},
        {"name": "frame", "type": "int16"},
        {"name": "description", "type": "string"},
        {"name": "source", "type": "string"},
        {"name": "gene_id", "type": "string"},
        {"name": "transcript_id", "type": "string"}
    ]

    additionalColumns = ['gene_id', 'transcript_id']
    for k, v in attributes.iteritems():
        if k not in ('', 'gene_id', 'transcript_id') and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [
        dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
        dxpy.DXGTable.lexicographic_index([
            dxpy.DXGTable.lexicographic_index_column("name", True, False),
            dxpy.DXGTable.lexicographic_index_column("chr"),
            dxpy.DXGTable.lexicographic_index_column("lo"),
            dxpy.DXGTable.lexicographic_index_column("hi"),
            dxpy.DXGTable.lexicographic_index_column("type")
        ], "search")
    ]
    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
Example 22
def parseLine(line, capturedTypes):
    tabSplit = line.split("\t")
    if len(tabSplit) == 1:
        tabSplit = line.split(" ")
        if len(tabSplit) < 9:
            raise dxpy.AppError(
                "One row did not have 9 entries, it had 1 instead. Offending line: "
                + line)
        tabSplit[8] = " ".join(tabSplit[8:])
        tabSplit = tabSplit[:9]
    chromosome = tabSplit[0]
    source = tabSplit[1]
    typ = tabSplit[2]
    if capturedTypes.get(typ) == None:
        message = 'Permitted types: ' + ", ".join(capturedTypes.keys())
        raise dxpy.AppError(
            "One row had a type which is not in the list of permitted types. "
            + message + "\nOffending line: " + line + "\nOffending type: " +
            typ)

    try:
        score = float(tabSplit[5])
    except ValueError:
        if tabSplit[5] == "." or tabSplit[5] == '':
            score = dxpy.NULL
        else:
            raise dxpy.AppError(
                "The score for one line could not be translated into a number and was not \".\""
                + "\nOffending line: " + line + "\nOffending value: " +
                tabSplit[5])

    if tabSplit[6] != "+" and tabSplit[6] != "-" and tabSplit[6] != ".":
        raise dxpy.AppError(
            "The strand indicated for an element was not \"+\", \"-\", or \".\""
            + "\nOffending line: " + line + "\nOffending value: " +
            tabSplit[6])
    else:
        strand = tabSplit[6]

    try:
        # GTF coordinates are 1-based; convert the start to 0-based
        lo = int(tabSplit[3]) - 1
    except ValueError:
        raise dxpy.AppError(
            "One of the start values could not be translated to an integer. "
            + "\nOffending line: " + line + "\nOffending value: " +
            tabSplit[3])

    try:
        hi = int(tabSplit[4])
    except ValueError:
        raise dxpy.AppError(
            "One of the end values could not be translated to an integer. "
            + "\nOffending line: " + line + "\nOffending value: " +
            tabSplit[4])

    try:
        frame = int(tabSplit[7])
        if frame > 2 or frame < 0:
            raise dxpy.AppError(
                "The frame indicated for an element was not \".\", \"0\", \"1\", or \"2\""
                + "\nOffending line: " + line + "\nOffending value: " +
                tabSplit[7])
    except ValueError:
        if tabSplit[7] == ".":
            frame = -1
        else:
            raise dxpy.AppError(
                "The frame indicated for an element was not \".\", \"0\", \"1\", or \"2\""
                + "\nOffending line: " + line + "\nOffending value: " +
                tabSplit[7])

    lineAttributes = {}

    ##Extract the attributes from the file
    entrySplit = tabSplit[8].split(";")
    result = []
    for x in entrySplit:
        keyValue = x.strip().split(" ")
        key = keyValue[0]
        if key != '' and len(key) < 100 and len(keyValue) > 1:
            lineAttributes[key.strip('"')] = keyValue[1].strip('"')

    geneId = lineAttributes["gene_id"]
    transcriptId = lineAttributes["transcript_id"]

    geneName = geneId
    if "gene_name" in lineAttributes:
        geneName = lineAttributes["gene_name"]
    transcriptName = transcriptId
    if "transcript_name" in lineAttributes:
        transcriptName = lineAttributes["transcript_name"]

    values = {
        "chromosome": chromosome,
        "lo": lo,
        "hi": hi,
        "geneName": geneName,
        "transcriptName": transcriptName,
        "source": source,
        "type": typ,
        "strand": strand,
        "score": score,
        "frame": frame,
        "geneId": geneId,
        "transcriptId": transcriptId,
        "attributes": lineAttributes
    }
    return values
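The attribute-splitting logic above can be exercised standalone on a sample GTF attributes field:

attrs_field = 'gene_id "ENSG0001"; transcript_id "ENST0001"; exon_number "1";'
attrs = {}
for item in attrs_field.split(";"):
    kv = item.strip().split(" ")
    if kv[0] and len(kv) > 1:
        attrs[kv[0]] = kv[1].strip('"')
print(attrs["gene_id"], attrs["transcript_id"])  # ENSG0001 ENST0001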
Example 23
def unpack(input):
    m = magic.Magic()

    # determine compression format
    try:
        file_type = m.from_file(input)
    except Exception as e:
        raise dxpy.AppError("Error while identifying compression format: " + str(e))
    
    # if we find a tar file throw a program error telling the user to unpack it
    if file_type == 'application/x-tar':
        raise dxpy.AppError("App does not support tar files.  Please unpack.")

    # determine which utility (if any) can decompress this format
    uncomp_util = None
    if file_type == 'XZ compressed data':
        uncomp_util = 'xzcat'
    elif file_type[:21] == 'bzip2 compressed data':
        uncomp_util = 'bzcat'
    elif file_type[:20] == 'gzip compressed data':
        uncomp_util = 'zcat'
    elif file_type == 'POSIX tar archive (GNU)' or 'tar' in file_type:
        raise dxpy.AppError("Found a tar archive.  Please untar your sequences before importing")
    else:
        # just return input filename since it's already uncompressed
        return input

    if uncomp_util != None:        
        # bzcat does not support -t.  Use non streaming decompressors for testing input
        test_util = None
        if uncomp_util == 'xzcat':
            test_util = 'xz'
        elif uncomp_util == 'bzcat':
            test_util = 'bzip2'
        elif uncomp_util == 'zcat':
            test_util = 'gzip'

        try:
            subprocess.check_call(" ".join([test_util, "-t", input]), shell=True)
        except subprocess.CalledProcessError:
            raise dxpy.AppError("File failed integrity check by "+uncomp_util+".  Compressed file is corrupted.")

    # with that in hand, unzip file.  If we find a tar archive then exit with error.
    try:
        with subprocess.Popen([uncomp_util, input], stdout=subprocess.PIPE).stdout as pipe:
            line = pipe.next()
        uncomp_type = m.from_buffer(line)
    except Exception as e:
        raise dxpy.AppError("Error detecting file format after decompression: " + str(e))

    if uncomp_type == 'POSIX tar archive (GNU)' or 'tar' in uncomp_type:
        raise dxpy.AppError("Found a tar archive after decompression.  Please untar your files before importing")
    elif 'ASCII text' not in uncomp_type:
        raise dxpy.AppError("After decompression found file type other than plain text")
    
    try:
        out_name = id_generator()
        subprocess.check_call(" ".join([uncomp_util, "--stdout", input, ">", out_name]), shell=True)
        return out_name
    except subprocess.CalledProcessError as e:
        raise dxpy.AppError("Unable to open compressed input for reading: " + str(e))
Example 24
def importGFF(**args):

    if len(args) == 0:
        args = parser.parse_args(sys.argv[1:])
        fileName = args.fileName
        reference = args.reference
        outputName = args.outputName
        file_id = args.file_id
        property_key = args.property_key
        property_value = args.property_value
        tag = args.tag
        additional_type = args.additional_type

    else:
        fileName = args['fileName']
        reference = args['reference']
        outputName = args.get('outputName') or ''
        tag = args.get('tag') or []
        property_key = args.get('property_key') or []
        property_value = args.get('property_value') or []
        additional_type = args.get('additional_type') or []
        file_id = args.get('file_id')

    inputFileName = unpack(fileName)

    #Rows of this type will not be written to the gtable as their information is fully encompassed by the rest of the data
    discardedTypes = {"start_codon": True, "stop_codon": True}

    ##Isolate the attribute tags from the file and check integrity
    spansTable, additionalColumns = constructTable(inputFileName)

    details = {'original_contigset': dxpy.dxlink(reference)}
    if file_id != None:
        details['original_file'] = dxpy.dxlink(file_id)
    if len(property_key) != len(property_value):
        raise dxpy.AppError(
            "Expected each provided property to have a corresponding value.")
    for i in range(len(property_key)):
        details[property_key[i]] = property_value[i]

    spansTable.set_details(details)
    spansTable.add_tags(tag)

    if outputName == '':
        spansTable.rename(fileName)
    else:
        spansTable.rename(outputName)

    hasGenes = False

    #This pass through the file calculates the gene and transcript models
    genes = {}
    transcripts = {}
    spanId = 0

    sequenceOntology = {}
    for x in [
            "five_prime_UTR", "5' UTR", "five prime UTR",
            "five_prime_untranslated_region",
            "five_prime_coding_exon_noncoding_region",
            "five_prime_exon_noncoding_region",
            "five prime coding exon noncoding region"
    ]:
        sequenceOntology[x] = "5' UTR"
    for x in [
            "three_prime_UTR", "3' UTR", "three prime UTR",
            "three_prime_untranslated_region",
            "three_prime_coding_exon_noncoding_region",
            "three_prime_exon_noncoding_region",
            "three prime coding exon noncoding region"
    ]:
        sequenceOntology[x] = "3' UTR"
    for x in [
            "mRNA", "rRNA", "tRNA", "snRNA", "snoRNA", "miRNA", "ncRNA",
            "transcript", "mature_transcript",
            "rRNA_large_subunit_primary_transcript",
            "35S rRNA primary transcript",
            "rRNA large subunit primary transcript", "rRNA_primary_transcript",
            "enzymatic_RNA", "nc_primary_transcript", "scRNA",
            "protein_coding_primary_transcript", "antisense_RNA",
            "antisense_primary_transcript", "primary_transcript",
            "ribosomal_subunit_rRNA", "small subunit rRNA", "SSU RNA",
            "SSU rRNA", "large_subunit_rRNA", "LSU RNA", "LSU rRNA"
    ]:
        sequenceOntology[x] = "transcript"
    for x in [
            "exon", "interior_coding_exon", "interior coding exon",
            "coding_exon", "coding exon", "five_prime_coding_exon_region",
            "five prime exon coding region", "three_prime_coding_exon_region",
            "three prime coding exon region", "five_prime_coding_exon",
            "three_prime_coding_exon", "non_coding_exon", "non coding exon"
    ]:
        sequenceOntology[x] = "exon"

    isCoding = {}
    for x in [
            "CDS", "interior_coding_exon", "interior coding exon",
            "coding_exon", "five_prime_coding_exon_region",
            "five prime exon coding region", "three_prime_coding_exon_region",
            "three prime coding exon region", "five_prime_coding_exon",
            "three_prime_coding_exon"
    ]:
        isCoding[x] = True

    codingRegions = {}
    spans = {}

    inputFile = open(inputFileName, 'r')
    for line in inputFile:
        if line[0] != "#":
            values = parseLine(line.split("#")[0])

            if values["attributes"].get("Parent") != None:
                for parent in values["attributes"]["Parent"].split(","):
                    if codingRegions.get(parent) == None:
                        codingRegions[parent] = {
                            values["chromosome"]: {
                                "codingLo": -1,
                                "codingHi": -1
                            }
                        }
                    if isCoding.get(values["type"]) != None:
                        if values["lo"] < codingRegions[parent][values[
                                "chromosome"]]["codingLo"] or codingRegions[
                                    parent][values["chromosome"]][
                                        "codingLo"] == -1:
                            codingRegions[parent][values["chromosome"]][
                                "codingLo"] = values["lo"]
                        if values["hi"] > codingRegions[parent][values[
                                "chromosome"]]["codingHi"] or codingRegions[
                                    parent][values["chromosome"]][
                                        "codingLo"] == -1:
                            codingRegions[parent][values["chromosome"]][
                                "codingHi"] = values["hi"]
            if values["attributes"].get("ID") != None:
                spans[values["attributes"]["ID"]] = spanId
            spanId += 1

    inputFile = open(inputFileName, 'r')
    overflowSpans = spanId
    spanId = 0

    for line in inputFile:
        if line[0] != "#":
            values = parseLine(line.split("#")[0])
            entryIsCoding = False
            if isCoding.get(values["type"]) != None:
                entryIsCoding = True
            if values["attributes"].get("Name") != None:
                name = values["attributes"]["Name"]
            elif values["attributes"].get("name") != None:
                name = values["attributes"]["name"]
            elif values["attributes"].get("NAME") != None:
                name = values["attributes"]["NAME"]
            elif values["attributes"].get("ID") != None:
                name = values["attributes"]["ID"]
            else:
                name = ''
            if sequenceOntology.get(values["type"]) != None:
                values["type"] = sequenceOntology[values["type"]]
                hasGenes = True
            description = ''
            if values["attributes"].get("description") != None:
                description = values["attributes"]["description"]
            if values["attributes"].get("Description") != None:
                description = values["attributes"]["description"]

            parent = -1
            if values["type"] not in discardedTypes:
                if values["attributes"].get("Parent") != None:
                    parentSplit = values["attributes"]["Parent"].split(",")
                else:
                    parentSplit = ["-1"]
                for parent in parentSplit:
                    currentSpan = spanId
                    parentId = -1
                    if spans.get(parent) != None:
                        parentId = spans[parent]
                    # rows for the second and later parents take span IDs from
                    # the overflow range allocated after the first pass
                    if parentSplit.index(parent) > 0:
                        currentSpan = overflowSpans
                        overflowSpans += 1
                    # promote a non-coding entry to coding when either endpoint
                    # falls inside the coding region recorded for its ID or Parent
                    if not entryIsCoding:
                        for x in ["ID", "Parent"]:
                            key = values["attributes"].get(x)
                            if key == None or codingRegions.get(key) == None:
                                continue
                            region = codingRegions[key].get(values["chromosome"])
                            if region == None:
                                continue
                            if region["codingLo"] == -1 or region["codingHi"] == -1:
                                continue
                            if region["codingLo"] <= values["lo"] <= region["codingHi"]:
                                entryIsCoding = True
                            if region["codingLo"] <= values["hi"] <= region["codingHi"]:
                                entryIsCoding = True
                    entry = [
                        values["chromosome"], values["lo"], values["hi"], name,
                        currentSpan, values["type"], values["strand"],
                        values["score"], entryIsCoding, parentId, values["frame"],
                        description, values["source"]
                    ]
                    for x in additionalColumns:
                        if values["attributes"].get(x) != None:
                            entry.append(values["attributes"][x])
                        else:
                            entry.append('')
                    spansTable.add_rows([entry])
            spanId += 1

    if hasGenes:
        types = ["Genes", "gri"]
    else:
        types = ["Spans", "gri"]
    for x in additional_type:
        types.append(x)
    spansTable.add_types(types)
    spansTable.flush()
    spansTable.close()
    print(spansTable.get_id())
    job_outputs = dxpy.dxlink(spansTable.get_id())
    return job_outputs
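The first pass above collapses, for each parent ID, all of its coding children into a single [codingLo, codingHi] interval per chromosome, with -1 as the "unset" sentinel. A minimal standalone sketch of that bookkeeping (the helper name is hypothetical, not part of the importer):

def widen_coding_region(regions, parent, chromosome, lo, hi):
    # per parent, per chromosome: keep the min lo and max hi over coding features
    region = regions.setdefault(parent, {}).setdefault(
        chromosome, {"codingLo": -1, "codingHi": -1})
    if region["codingLo"] == -1 or lo < region["codingLo"]:
        region["codingLo"] = lo
    if region["codingHi"] == -1 or hi > region["codingHi"]:
        region["codingHi"] = hi

regions = {}
# hypothetical CDS features all belonging to transcript "tr1"
for lo, hi in [(100, 160), (300, 420), (50, 90)]:
    widen_coding_region(regions, "tr1", "chr1", lo, hi)
assert regions["tr1"]["chr1"] == {"codingLo": 50, "codingHi": 420}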
Esempio n. 25
0
def import_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags, isBedDetail, delimiter="\t"):
    num_cols = find_num_columns(bed_file, delimiter)

    # if this is a bedDetail file we should treat the last two columns separately
    if isBedDetail:
        num_cols -= 2
    
    possible_columns = [("chr", "string"),
                        ("lo", "int32"),
                        ("hi", "int32"),
                        ("name", "string"),
                        ("score", "float"),
                        ("strand", "string"),
                        ("thick_start", "int32"),
                        ("thick_end", "int32"),
                        ("item_rgb", "string")]

    bedDetail_columns = [("bedDetail_ID", "string"),
                         ("bedDetail_desc", "string")]

    possible_default_row = ["", 0, 0, "", 0, ".", 0, 0, ""]

    columns = possible_columns[:num_cols]

    if isBedDetail:
        columns.extend(bedDetail_columns)

    if num_cols > len(columns):
        for i in range(len(columns), num_cols):
            columns.append(("BED_column_"+str(i+1), "string"))
            possible_default_row.append("")

    default_row = possible_default_row[:num_cols]

    if isBedDetail:
        default_row.extend(["",""])

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]
    
    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri')]
    for c in columns:
        if "name" in c:
            indices.append(dxpy.DXGTable.lexicographic_index([
                              dxpy.DXGTable.lexicographic_index_column("name", True, False),
                              dxpy.DXGTable.lexicographic_index_column("chr"),
                              dxpy.DXGTable.lexicographic_index_column("lo"),
                              dxpy.DXGTable.lexicographic_index_column("hi")], "search"))
            break
            
    with open(bed_file, 'r') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]    
    
        span.set_details(details)

        span.add_types(["Spans", "gri"])
        span.rename(table_name)

        for line in bed:
            row = list(default_row)

            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            line = line.split(delimiter)
            if isBedDetail:
                # only the first 4 columns are guaranteed to be defined by UCSC
                validate_line(line[:4])
                # save last two fields separately
                bedDetailFields = line[-2:]
                line = line[:-2]     
            else:        
                validate_line(line[:num_cols])
            
            # skip blank lines rather than treating them as malformed rows
            if len(line) == 0 or line == ['']:
                continue
            if len(line) < 3:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in BED file contains less than the minimum 3 columns.  Invalid BED file.")

            try:
                row[0] = line[0]
                row[1] = int(line[1])
                row[2] = int(line[2])
                row[3] = line[3]
                # dashes are sometimes used when field is invalid
                if line[4] == "-" or line[4] == ".":
                    line[4] = 0
                row[4] = float(line[4])
                row[5] = line[5]
                # dashes are sometimes used when field is invalid
                if line[6] == "-" or line[6] == ".":
                    line[6] = 0
                row[6] = int(line[6])
                # dashes are sometimes used when field is invalid
                if line[7] == "-" or line[7] == ".":
                    line[7] = 0
                row[7] = int(line[7])
                row[8] = line[8]

            # an index error would come from having fewer columns in a row, which we should handle ok
            except IndexError:
                pass
            # value error when fields are messed up and string gets converted to int, etc.  Throw these out.
            except ValueError:
                continue
            
            if isBedDetail:
                # add these in at the end if we have a bedDetail file
                row[num_cols] = bedDetailFields[0]
                row[num_cols+1] = bedDetailFields[1]
            
            span.add_row(row)

        span.flush()

    return dxpy.dxlink(span.get_id())
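The parsing loop above pads each BED line against the default row and maps "-" or "." placeholders in the score, thick_start, and thick_end columns to 0. A condensed sketch of that conversion with a hypothetical helper (unlike the importer, it does not skip rows on ValueError):

def parse_bed_fields(fields, default_row):
    row = list(default_row)
    converters = [str, int, int, str, float, str, int, int, str]
    for i, value in enumerate(fields[:len(row)]):
        # columns 4, 6, 7 (score, thick_start, thick_end) may use "-" or "."
        if i in (4, 6, 7) and value in ("-", "."):
            value = 0
        row[i] = converters[i](value)
    return row

default_row = ["", 0, 0, "", 0, ".", 0, 0, ""]
print(parse_bed_fields(["chr1", "100", "200", "feat", ".", "+"], default_row))
# ['chr1', 100, 200, 'feat', 0.0, '+', 0, 0, '']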
Esempio n. 26
0
def process(project_id,
            output_folder,
            fastq_file,
            genome_fasta_file,
            genome_index_file,
            mapper,
            mark_duplicates,
            fastq_file2=None,
            sample_name=None,
            properties=None):
    """Download a single FASTQ file, map it, and output a coordinate-sorted
    BAM file."""

    logger = []
    bams_subfolder = output_folder + '/bams'

    if mapper not in SUPPORTED_MAPPERS:
        raise dxpy.AppError("Unsupported mapper: " + mapper)

    if mapper == "bwa_mem":
        if fastq_file2 == None:
            run_bwa_mem_single(fastq_file, genome_fasta_file,
                               genome_index_file, mark_duplicates, logger)
        else:
            run_bwa_mem_paired(fastq_file, fastq_file2, genome_fasta_file,
                               genome_index_file, mark_duplicates, logger)
    elif mapper == "bwa" or mapper == "bwa_aln":
        if fastq_file2 == None:
            run_bwa_backtrack_single(fastq_file, genome_fasta_file,
                                     genome_index_file, mark_duplicates,
                                     logger)
        else:
            run_bwa_backtrack_paired(fastq_file, fastq_file2,
                                     genome_fasta_file, genome_index_file,
                                     mark_duplicates, logger)
    else:
        raise dxpy.AppError("Unsupported mapper: " + mapper)

    run_samtools_calmd(logger)
    ''' From bwa_mem_fastq_read_mapper bash source:
    bwa mem -t `nproc` "$genome_file" $input $opts | samtools view -u -S - | samtools sort -m 256M -@ `nproc` - output
    samtools index output.bam
    '''
    index_cmd = 'samtools index sample.bam'
    run_cmd(index_cmd, logger)

    # default the output name when no sample_name was supplied
    if sample_name == None:
        sample_name = "sample"

    bam_file = dxpy.upload_local_file(filename="sample.bam",
                                      name=sample_name + ".bam",
                                      properties=properties,
                                      project=project_id,
                                      folder=bams_subfolder,
                                      parents=True)

    bai_file = dxpy.upload_local_file(filename="sample.bam.bai",
                                      name=sample_name + ".bai",
                                      properties=properties,
                                      project=project_id,
                                      folder=bams_subfolder,
                                      parents=True)

    return {
        "bam": dxpy.dxlink(bam_file),
        "bai": dxpy.dxlink(bai_file),
        "tools_used": logger
    }
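A hypothetical invocation; the project ID, folder, and filenames are placeholders, and the run_* helpers are assumed to be defined alongside process:

outputs = process(project_id="project-xxxx",
                  output_folder="/run1",
                  fastq_file="reads_1.fastq.gz",
                  genome_fasta_file="genome.fa.gz",
                  genome_index_file="genome.bwa-index.tar.gz",
                  mapper="bwa_mem",
                  mark_duplicates=True,
                  fastq_file2="reads_2.fastq.gz",
                  sample_name="sampleA")
# outputs["bam"] and outputs["bai"] are dxlinks to the uploaded files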
Esempio n. 27
0
def import_genes(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags, delimiter="\t"):
    # implement BED importing from this format:
    # http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    columns = [("chr", "string"),
               ("lo", "int32"),
               ("hi", "int32"),
               ("name", "string"),
               ("span_id", "int32"),
               ("type", "string"),
               ("strand", "string"),
               ("is_coding", "boolean"),
               ("parent_id", "int32"),
               ("frame", "int16"),
               ("description", "string")]

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]
    
    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri'), 
               dxpy.DXGTable.lexicographic_index([
                  dxpy.DXGTable.lexicographic_index_column("name", True, False),
                  dxpy.DXGTable.lexicographic_index_column("chr"),
                  dxpy.DXGTable.lexicographic_index_column("lo"),
                  dxpy.DXGTable.lexicographic_index_column("hi"),
                  dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    default_row = ["", 0, 0, "", -1, "", ".", False, -1, -1, ""]

    with open(bed_file, 'r') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        span_table_id = span.get_id()

        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]
        span.set_details(details)

        span.add_types(["gri", "Genes"])
        span.rename(table_name)

        current_span_id = 0

        # where the parsing magic happens
        for line in bed:
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            row = list(default_row)
            line = line.split(delimiter)
            validate_line(line)
            if len(line) < 12:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in gene model-like BED file contains less than 12 columns.  Invalid BED file.")

            # add parent gene track
            row = generate_gene_row(line, 0, 0, "transcript", default_row, -1, current_span_id)
            if row != None:
                span.add_row(row)
                current_parent_id = current_span_id
                current_span_id += 1          
                
                # add all children
                blockCount = int(line[9])
                line[10] = line[10].rstrip(",").split(",")
                blockSizes = [int(line[10][n]) for n in range(blockCount)]
                line[11] = line[11].rstrip(",").split(",")
                blockStarts = [int(line[11][n]) for n in range(blockCount)]

                gene_lo = int(line[1])
                gene_hi = int(line[2])

                # set thick* to be within the gene if outside
                thickStart = min(max(int(line[6]), gene_lo), gene_hi)
                thickEnd = max(min(int(line[7]), gene_hi), gene_lo)
                
                for i in range(blockCount):
                    # look to thickStart and thickEnd to get information about the type of this region
                    # if thick* are the same or cover the whole transcript then we ignore them
                    # else, we partition the exons into CDS and UTR based on their boundaries
                    if thickStart == thickEnd or (thickStart == gene_lo and thickEnd == gene_hi):
                        span.add_row(generate_gene_row(line, 
                                                       blockSizes[i], 
                                                       blockStarts[i], 
                                                       "exon", 
                                                       default_row, 
                                                       current_parent_id, 
                                                       current_span_id))
                        current_span_id += 1
                    else:
                        exon_lo = int(line[1])+blockStarts[i]
                        exon_hi = int(exon_lo+blockSizes[i])

                        # we're all UTR if we enter either of these
                        if (exon_hi <= thickStart and line[5] == '+') or (exon_lo >= thickEnd and line[5] == '-'):
                            span.add_row(generate_gene_row(line, 
                                                           blockSizes[i], 
                                                           blockStarts[i], 
                                                           "5' UTR", 
                                                           default_row, 
                                                           current_parent_id, 
                                                           current_span_id))
                            current_span_id += 1
                        elif (exon_hi <= thickStart and line[5] == '-') or (exon_lo >= thickEnd and line[5] == '+'):
                            span.add_row(generate_gene_row(line, 
                                                           blockSizes[i], 
                                                           blockStarts[i], 
                                                           "3' UTR", 
                                                           default_row, 
                                                           current_parent_id, 
                                                           current_span_id))
                            current_span_id += 1

                        # if this is true then we overlap CDS partially or completely
                        elif (exon_lo < thickEnd and exon_hi > thickStart):
                            # entirely contained
                            if exon_lo >= thickStart and exon_hi <= thickEnd:
                                span.add_row(generate_gene_row(line, 
                                                               blockSizes[i], 
                                                               blockStarts[i], 
                                                               "CDS", 
                                                               default_row, 
                                                               current_parent_id, 
                                                               current_span_id))
                                current_span_id += 1
                            else:
                                # left portion is UTR
                                if exon_lo < thickStart:
                                    if line[5] == '+':
                                        UTR_type = "5' UTR"
                                    else:
                                        UTR_type = "3' UTR"
                                    UTR_size = (min(blockSizes[i], thickStart - exon_lo))
                                    span.add_row(generate_gene_row(line, 
                                                                   UTR_size, 
                                                                   blockStarts[i], 
                                                                   UTR_type,
                                                                   default_row, 
                                                                   current_parent_id, 
                                                                   current_span_id))
                                    current_span_id += 1

                                # CDS portion
                                CDS_size = blockSizes[i] - (max(exon_lo, thickStart) - exon_lo)
                                CDS_size -= (exon_hi - min(exon_hi, thickEnd))
                                CDS_start = (max(exon_lo, thickStart) - exon_lo) + blockStarts[i]
                                span.add_row(generate_gene_row(line, 
                                                               CDS_size, 
                                                               CDS_start, 
                                                               "CDS",
                                                               default_row, 
                                                               current_parent_id, 
                                                               current_span_id))
                                current_span_id += 1

                                # right portion is UTR
                                if exon_hi > thickEnd:
                                    if line[5] == '+':
                                        UTR_type = "3' UTR"
                                    else:
                                        UTR_type = "5' UTR"
                                    UTR_size = (min(blockSizes[i], exon_hi - thickEnd))
                                    UTR_start = blockStarts[i] + thickEnd - exon_lo
                                    span.add_row(generate_gene_row(line, 
                                                                   UTR_size, 
                                                                   UTR_start, 
                                                                   UTR_type,
                                                                   default_row, 
                                                                   current_parent_id, 
                                                                   current_span_id))
                                    current_span_id += 1

    return dxpy.dxlink(span.get_id())
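The thickStart/thickEnd branch above partitions each exon into UTR and CDS pieces. A standalone sketch of that rule for the + strand (the importer swaps the UTR labels on the - strand); the helper name is hypothetical:

def partition_exon(exon_lo, exon_hi, thick_start, thick_end):
    pieces = []
    if exon_lo < thick_start:
        pieces.append(("5' UTR", exon_lo, min(exon_hi, thick_start)))
    if exon_lo < thick_end and exon_hi > thick_start:
        pieces.append(("CDS", max(exon_lo, thick_start), min(exon_hi, thick_end)))
    if exon_hi > thick_end:
        pieces.append(("3' UTR", max(exon_lo, thick_end), exon_hi))
    return pieces

print(partition_exon(100, 400, 150, 350))
# [("5' UTR", 100, 150), ('CDS', 150, 350), ("3' UTR", 350, 400)]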
Esempio n. 28
0
def parseLine(line):
    line = line.strip().split("#")[0]
    tabSplit = line.split("\t")
    if len(tabSplit) == 1:
        tabSplit = line.split(" ")
        if len(tabSplit) < 9:
            raise dxpy.AppError(
                "One row did not have 8 or 9 entries, it had " +
                str(len(tabSplit)) + " instead. Offending line: " + line)
        tabSplit[8] = " ".join(tabSplit[8:])
        tabSplit = tabSplit[:9]
    chromosome = tabSplit[0]
    source = tabSplit[1]
    typ = tabSplit[2]

    try:
        lo = int(tabSplit[3]) - 1
    except ValueError:
        raise dxpy.AppError(
            "One of the start values could not be translated to an integer. "
            + "\nOffending line: " + line + "\nOffending value: " +
            tabSplit[3])

    try:
        hi = int(tabSplit[4])
    except ValueError:
        raise dxpy.AppError(
            "One of the end values could not be translated to an integer. "
            + "\nOffending line: " + line + "\nOffending value: " +
            tabSplit[4])

    try:
        score = float(tabSplit[5])
    except ValueError:
        if tabSplit[5] == "." or tabSplit[5] == '':
            score = dxpy.NULL
        else:
            raise dxpy.AppError(
                "The score for one line could not be translated into a number and was not \".\""
                + "\nOffending line: " + line + "\nOffending value: " +
                tabSplit[5])

    tabSplit[6] = tabSplit[6].replace("?", ".")
    if tabSplit[6] != "+" and tabSplit[6] != "-" and tabSplit[6] != ".":
        raise dxpy.AppError(
            "The strand indicated for an element was not \"+\", \"-\", \"?\", or \".\""
            + "\nOffending line: " + line + "\nOffending value: " +
            tabSplit[6])
    else:
        strand = tabSplit[6]

    try:
        frame = int(tabSplit[7])
        if frame > 2 or frame < 0:
            raise dxpy.AppError(
                "The frame indicated for an element was not \".\", \"0\", \"1\", or \"2\""
                + "\nOffending line: " + line + "\nOffending value: " +
                tabSplit[7])
    except ValueError:
        if tabSplit[7] == ".":
            frame = -1
        else:
            raise dxpy.AppError(
                "The frame indicated for an element was not \".\", \"0\", \"1\", or \"2\""
                + "\nOffending line: " + line + "\nOffending value: " +
                tabSplit[7])

    lineAttributes = {}
    # extract key=value attributes from the 9th column, if present
    if len(tabSplit) >= 9:
        reg = re.findall("([^=]*)=([^;]*);", tabSplit[8].strip() + ";")
        for x in reg:
            if len(x[0]) < 100:
                lineAttributes[x[0]] = x[1].strip().strip("\"")
    values = {
        "chromosome": chromosome,
        "lo": lo,
        "hi": hi,
        "source": source,
        "type": typ,
        "strand": strand,
        "score": score,
        "frame": frame,
        "attributes": lineAttributes
    }
    return values
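For example, on a well-formed tab-delimited GFF row, parseLine converts the 1-based start into the 0-based "lo" used internally and unpacks column 9 into a dict:

line = "chr1\ttest\texon\t101\t200\t.\t+\t.\tID=exon1;Parent=tr1"
values = parseLine(line)
assert values["lo"] == 100 and values["hi"] == 200
assert values["attributes"] == {"ID": "exon1", "Parent": "tr1"}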
Esempio n. 29
0
def validate_line(line):
    line_str = "\t".join(line)
    entries = list(line)
    
    if len(entries) > 1:
        try:
            if int(entries[1]) < 0:
                raise dxpy.AppError("The start position for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[1]))
        except ValueError:
            raise dxpy.AppError("One of the start values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[1]))
    
    if len(entries) > 2:    
        try:
            if int(entries[2]) < 0:
                raise dxpy.AppError("The end position for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[2]))
        except ValueError:
            raise dxpy.AppError("One of the end values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[2]))
        
    if len(entries) > 4:    
        try:
            if entries[4] != "." and entries[4] != "-":
                float(entries[4])
        except ValueError:
            raise dxpy.AppError("One of the score values for one entry could not be translated to a number. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[4]))
        
    if len(entries) > 5:
        if entries[5] != "+" and entries[5] != "-" and entries[5] != ".":
            raise dxpy.AppError("The strand indicated for an element was not \"+\", \"-\", or \".\"" + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[5]))
    
    if len(entries) > 6:
        try:
            if entries[6] != "." and entries[6] != "-":
                if int(entries[6]) < 0:
                    raise dxpy.AppError("The thickStart position for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[6]))
        except ValueError:
            raise dxpy.AppError("One of the thickStart values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[6]))
    
    if len(entries) > 7:    
        try:
            if entries[7] != "." and entries[7] != "-":
                if int(entries[7]) < 0:
                    raise dxpy.AppError("The thickEnd position for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[7]))
        except ValueError:
            raise dxpy.AppError("One of the thickEnd values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[7]))
    
    if len(entries) > 9:
        try:
            if int(entries[9]) < 0:
                raise dxpy.AppError("The number of exons (blockCount) for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[9]))
        except ValueError:
            raise dxpy.AppError("One of the blockCount values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[9]))
    
    if len(entries) > 10:
        try:
            entries[10] = entries[10].rstrip(",").split(",")
            blockSizes = [int(entries[10][n]) for n in range(int(entries[9]))]
        except (IndexError, ValueError):
            raise dxpy.AppError("Could not parse the blockSizes entry as a comma-separated list of integers \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[10]))
        
    if len(entries) > 11:
        try:
            entries[11] = entries[11].rstrip(",").split(",")
            blockStarts = [int(entries[11][n]) for n in range(int(entries[9]))]
        except (IndexError, ValueError):
            raise dxpy.AppError("Could not parse the blockStarts entry as a comma-separated list of integers \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[11]))
Esempio n. 30
0
def constructTable(inputFileName):
    inputFile = open(inputFileName, 'r')
    attributes = {}
    for line in inputFile:
        if line[0] != "#":
            line = line.strip().split("#")[0]
            tabSplit = line.split("\t")
            if len(tabSplit) == 1:
                tabSplit = line.split(" ")
                if len(tabSplit) < 9:
                    raise dxpy.AppError(
                        "One row did not have 8 or 9 entries, it had " +
                        str(len(tabSplit)) + " instead. Offending line: "
                        + line)
                tabSplit[8] = " ".join(tabSplit[8:])
                tabSplit = tabSplit[:9]

            if len(tabSplit) != 8 and len(tabSplit) != 9:
                raise dxpy.AppError(
                    "One row did not have 8 or 9 entries, it had " +
                    str(len(tabSplit)) + " instead. Offending line: " + line)
            elif len(tabSplit) == 9:
                reg = re.findall("([^=]*)=([^;]*);", tabSplit[8].strip() + ";")
                for x in reg:
                    attributes[x[0]] = True

    reservedColumns = [
        "", "chr", "lo", "hi", "name", "span_id", "type", "score", "is_coding",
        "parent_id", "frame", "description", "source"
    ]

    #Construct table
    schema = [{
        "name": "chr",
        "type": "string"
    }, {
        "name": "lo",
        "type": "uint32"
    }, {
        "name": "hi",
        "type": "uint32"
    }, {
        "name": "name",
        "type": "string"
    }, {
        "name": "span_id",
        "type": "int32"
    }, {
        "name": "type",
        "type": "string"
    }, {
        "name": "strand",
        "type": "string"
    }, {
        "name": "score",
        "type": "float"
    }, {
        "name": "is_coding",
        "type": "boolean"
    }, {
        "name": "parent_id",
        "type": "int32"
    }, {
        "name": "frame",
        "type": "int16"
    }, {
        "name": "description",
        "type": "string"
    }, {
        "name": "source",
        "type": "string"
    }]

    additionalColumns = []
    for k, v in attributes.items():
        if k not in reservedColumns and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [
        dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
        dxpy.DXGTable.lexicographic_index([
            dxpy.DXGTable.lexicographic_index_column("name", True, False),
            dxpy.DXGTable.lexicographic_index_column("chr"),
            dxpy.DXGTable.lexicographic_index_column("lo"),
            dxpy.DXGTable.lexicographic_index_column("hi"),
            dxpy.DXGTable.lexicographic_index_column("type")
        ], "search")
    ]
    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
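The attribute scan that feeds additionalColumns uses the same key=value regex as parseLine; a condensed demonstration on a hypothetical column-9 string:

import re
attrs = {}
for col9 in ["ID=g1;Name=BRCA2;gene_biotype=protein_coding;"]:
    for key, _ in re.findall("([^=]*)=([^;]*);", col9):
        attrs[key] = True
print(sorted(attrs))  # ['ID', 'Name', 'gene_biotype']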