def test_table_context_manager_error_handling(self):
    """Errors raised server-side by add_row data must surface at context exit.

    In each case, the flush that happens at the close of the context handler
    should wait for the asynchronous requests and then raise the resulting
    error.  Note that this test assumes that the error is a semantic error in
    the add_row data that is NOT caught by any local error checking.
    """
    # Use new_dxgtable
    with self.assertRaises(DXAPIError):
        with dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                dxpy.DXGTable.make_column_desc("b", "int32")],
                               mode='w') as table1:
            table1.add_row(["", 68719476736])  # Not in int32 range

    # Use open_dxgtable and close table
    table2_id = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                   dxpy.DXGTable.make_column_desc("b", "int32")],
                                  mode='w').get_id()
    with self.assertRaises(DXAPIError):
        with dxpy.open_dxgtable(table2_id) as table2:
            table2.add_row(["", 68719476736])  # Not in int32 range
    # TODO: why does the flush in this table's destructor fail?  Nothing should
    # be getting flushed then...

    # Use open_dxgtable and leave table open
    # FIX: the original stored the DXGTable handler itself in table3_id
    # (missing .get_id()), unlike table2_id above; open_dxgtable expects an ID.
    table3_id = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                   dxpy.DXGTable.make_column_desc("b", "int32")]).get_id()
    with self.assertRaises(DXAPIError):
        with dxpy.open_dxgtable(table3_id, mode='a') as table3:
            table3.add_row(["", 68719476736])  # Not in int32 range
def test_table_context_manager(self):
    """Exercise the GTable context manager over new and reopened tables.

    Covers three write paths: a fresh table written with explicit parts, a
    fresh table with auto-assigned parts, and a pre-created table reopened in
    append ('a') and write ('w') modes.
    """
    # Writing a new_dxgtable with parts
    with dxpy.new_dxgtable(
            [dxpy.DXGTable.make_column_desc("a", "string"),
             dxpy.DXGTable.make_column_desc("b", "int32")],
            mode='w') as self.dxgtable:
        for i in range(64):
            self.dxgtable.add_rows(data=[["row"+str(i), i]], part=i+1)

    # Writing a new_dxgtable without parts
    with dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                            dxpy.DXGTable.make_column_desc("b", "int32")],
                           mode='w') as table2:
        table2_id = table2.get_id()
        for i in range(64):
            table2.add_rows(data=[["row"+str(i), i]])
    # Context exit flushes and closes; reopen to verify all rows arrived.
    table2 = dxpy.open_dxgtable(table2_id)
    self.assertEqual(table2.describe()["length"], 64)
    table2.remove()

    # Writing an open_dxgtable
    table3_id = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                   dxpy.DXGTable.make_column_desc("b", "int32")]).get_id()
    with dxpy.open_dxgtable(table3_id, mode='a') as table3:
        for i in range(64):
            table3.add_rows(data=[["row"+str(i), i]])
    with dxpy.open_dxgtable(table3_id, mode='w') as table3:
        for i in range(64):
            table3.add_rows(data=[["row"+str(i), i]])
    table3 = dxpy.open_dxgtable(table3_id)
    # 'w' mode triggers a close at context exit; the table may still be in the
    # 'closing' state immediately afterwards, so accept either.
    state = table3._get_state()
    self.assertTrue(state in ['closing', 'closed'])
    table3._wait_on_close()
    # 64 rows from each of the two writing sessions above.
    self.assertEqual(table3.describe()["length"], 128)
    table3.remove()
def get_col_names(self):
    """Verify get_col_names returns the implicit __id__ column plus the schema columns."""
    # NOTE(review): this looks like a test method missing the "test_" prefix,
    # so unittest discovery may never run it — confirm intent before renaming.
    schema = [dxpy.DXGTable.make_column_desc(col, kind)
              for col, kind in (("a", "string"), ("b", "int32"))]
    self.dxgtable = dxpy.new_dxgtable(schema)
    self.dxgtable.close(block=True)
    self.assertEqual(self.dxgtable.get_col_names(), ["__id__", "a", "b"])
def test_genes_to_gtf_conversion(self):
    """Round-trip a minimal Genes table through dx-genes-to-gtf and compare the output.

    Builds the dx "genes" span-tree schema, inserts a non-coding
    transcript/exon pair and a coding gene/transcript/CDS chain, then checks
    the converter's stdout against the expected GTF fixture.
    """
    genes_table = dxpy.new_dxgtable([
        dxpy.DXGTable.make_column_desc("type", "string"),
        dxpy.DXGTable.make_column_desc("span_id", "int64"),
        dxpy.DXGTable.make_column_desc("name", "string"),
        dxpy.DXGTable.make_column_desc("strand", "string"),
        dxpy.DXGTable.make_column_desc("is_coding", "boolean"),
        dxpy.DXGTable.make_column_desc("parent_id", "int64"),
        dxpy.DXGTable.make_column_desc("frame", "int64"),
        dxpy.DXGTable.make_column_desc("description", "string"),
        dxpy.DXGTable.make_column_desc("chr", "string"),
        dxpy.DXGTable.make_column_desc("lo", "int64"),
        dxpy.DXGTable.make_column_desc("hi", "int64")
    ])
    # parent_id == -1 marks a root span; frame == -1 means "no frame".
    genes_table.add_rows(data=[
        ["transcript", 5, "mytranscript-noncoding", "+", False, -1, -1, "my test transcript", "chr1", 100, 200],
        ["exon", 6, "", "+", False, 5, -1, "", "chr1", 100, 200],
        ["gene", 54, "mygene-coding", "+", True, -1, -1, "my test gene", "chr1", 150, 200],
        ["transcript", 55, "mytranscript-coding", "+", True, 54, -1, "my test transcript", "chr1", 150, 200],
        ["CDS", 75, "", "+", True, 55, 0, "", "chr1", 150, 200]
    ])
    genes_table.set_details({
        "original_contigset": {"$dnanexus_link": self.genome_id}
    })
    # Block until fully closed so the converter can read the table.
    genes_table.close(block=True)
    # FIX: assertEquals is a deprecated alias of assertEqual (removed in
    # Python 3.12); use the canonical name.
    self.assertEqual(run('dx-genes-to-gtf {g}'.format(g=genes_table.get_id())),
                     self.expected_gtf)
def test_create_table(self):
    """A newly created table should describe() with exactly the columns it was given."""
    schema = [dxpy.DXGTable.make_column_desc("a", "string"),
              dxpy.DXGTable.make_column_desc("b", "int32")]
    self.dxgtable = dxpy.new_dxgtable(schema)
    self.dxgtable.close(block=True)
    self.assertEqual(self.dxgtable.describe()["columns"], schema)
def constructTable(inputFileName):
    """Scan a GFF-style file for attribute keys and create a Spans GTable.

    Returns a tuple (spansTable, additionalColumns) where additionalColumns
    lists the attribute-derived string columns appended to the fixed schema.
    """
    inputFile = open(inputFileName, 'r')
    attributes = {}
    for line in inputFile:
        if line[0] != "#":
            # Strip trailing comments, then split into the 9 GFF fields.
            line = line.strip().split("#")[0]
            tabSplit = line.split("\t")
            if len(tabSplit) == 1:
                # Fall back to space-delimited input; everything from field 9
                # onward is rejoined into the attributes column.
                tabSplit = line.split(" ")
                if len(tabSplit) < 9:
                    # NOTE(review): message hardcodes "1" — presumably referring
                    # to the single tab-delimited field found above; confirm.
                    raise dxpy.AppError("One row did not have 8 or 9 entries, it had 1 instead. Offending line: " + line)
                tabSplit[8] = " ".join(tabSplit[8:])
                tabSplit = tabSplit[:9]
            if len(tabSplit) != 8 and len(tabSplit) != 9:
                raise dxpy.AppError("One row did not have 8 or 9 entries, it had " + str(len(tabSplit)) + " instead. Offending line: " + line)
            elif len(tabSplit) == 9:
                # Collect "key=value;" attribute keys; values are discarded —
                # only the set of keys matters for schema construction.
                reg = re.findall("([^=]*)=([^;]*);", tabSplit[8].strip() + ";")
                for x in reg:
                    attributes[x[0]] = True

    # Attribute keys colliding with fixed schema columns must not be added.
    reservedColumns = ["", "chr", "lo", "hi", "name", "span_id", "type",
                       "score", "is_coding", "parent_id", "frame",
                       "description", "source"]

    # Construct table
    schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "uint32"},
        {"name": "hi", "type": "uint32"},
        {"name": "name", "type": "string"},
        {"name": "span_id", "type": "int32"},
        {"name": "type", "type": "string"},
        {"name": "strand", "type": "string"},
        {"name": "score", "type": "float"},
        {"name": "is_coding", "type": "boolean"},
        {"name": "parent_id", "type": "int32"},
        {"name": "frame", "type": "int16"},
        {"name": "description", "type": "string"},
        {"name": "source", "type": "string"}]

    # One extra string column per (reasonably named) attribute key.
    additionalColumns = []
    for k, v in attributes.iteritems():
        if k not in reservedColumns and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    # Genomic-range index plus a lexicographic "search" index over the
    # name/position/type columns.
    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                   dxpy.DXGTable.lexicographic_index_column("name", True, False),
                   dxpy.DXGTable.lexicographic_index_column("chr"),
                   dxpy.DXGTable.lexicographic_index_column("lo"),
                   dxpy.DXGTable.lexicographic_index_column("hi"),
                   dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
def test_get_rows(self):
    """get_rows must fail on an open table and return all rows once closed."""
    self.dxgtable = dxpy.new_dxgtable(
        [dxpy.DXGTable.make_column_desc("a", "string"),
         dxpy.DXGTable.make_column_desc("b", "int32")])
    for idx in range(64):
        self.dxgtable.add_rows(data=[["row%d" % idx, idx]], part=idx + 1)
    # Reading an open table is a server-side error.
    with self.assertRaises(DXAPIError):
        self.dxgtable.get_rows()
    self.dxgtable.close(block=True)
    fetched = self.dxgtable.get_rows()['data']
    assert(len(fetched) == 64)
def test_add_rows(self):
    """add_rows accepts empty batches, rejects mis-shaped rows, and close() is one-shot."""
    self.dxgtable = dxpy.new_dxgtable(
        [dxpy.DXGTable.make_column_desc("a", "string"),
         dxpy.DXGTable.make_column_desc("b", "int32")])
    # An empty batch of rows is allowed.
    self.dxgtable.add_rows(data=[], part=9999)
    # Wrong number of columns is caught locally.
    with self.assertRaises(ValueError):
        self.dxgtable.add_rows(data=[[]], part=9997)
    for idx in range(64):
        self.dxgtable.add_rows(data=[["row%d" % idx, idx]], part=idx + 1)
    self.dxgtable.close(block=True)
    # Closing an already-closed table is a server-side error.
    with self.assertRaises(DXAPIError):
        self.dxgtable.close(block=True)
def test_lexicographic(self):
    """A lexicographic index definition must survive the round trip through describe()."""
    search_index = dxpy.DXGTable.lexicographic_index(
        [dxpy.DXGTable.lexicographic_index_column("a", case_sensitive=False),
         dxpy.DXGTable.lexicographic_index_column("b", ascending=False)],
        "search")
    self.dxgtable = dxpy.new_dxgtable(
        [dxpy.DXGTable.make_column_desc("a", "string"),
         dxpy.DXGTable.make_column_desc("b", "int32")],
        indices=[search_index])
    self.dxgtable.close(block=True)
    expected = {u"name": u"search",
                u"type": u"lexicographic",
                u"columns": [{u"name": u"a", u"order": u"asc", u"caseSensitive": False},
                             {u"name": u"b", u"order": u"desc"}]}
    self.assertEqual(expected, self.dxgtable.describe()['indices'][0])
def test_add_rows_no_index(self):
    """Rows added without explicit part numbers land in one auto-assigned part."""
    self.dxgtable = dxpy.new_dxgtable(
        [dxpy.DXGTable.make_column_desc("a", "string"),
         dxpy.DXGTable.make_column_desc("b", "int32")])
    for idx in range(64):
        self.dxgtable.add_rows(data=[["row%d" % idx, idx]])
    # flush() sends the buffered rows so the part becomes visible.
    self.dxgtable.flush()
    self.assertEqual(len(self.dxgtable.describe()["parts"]), 1)
    self.dxgtable.close(block=True)
    self.assertEqual(self.dxgtable.describe()["length"], 64)
def main(**kwargs): columns = [dxpy.DXGTable.make_column_desc("word", "string")] # Call a subprocess and dump its output to a local file. # (Remove possessives and other bogus words from the word list) subprocess.check_call('egrep "^[a-z]+$" /usr/share/dict/american-english > words.txt', shell=True) # Parse the file we just generated into a GTable. with dxpy.new_dxgtable(columns=columns, mode="w") as output_gtable: for index, word in enumerate(open("words.txt")): output_gtable.add_row([word.strip()]) if index % 10000 == 0: print "Read word: " + word.strip() # Closing the GTable automatically commences at the conclusion of the "with" block. return {"words": dxpy.dxlink(output_gtable.get_id())}
def main(**kwargs): columns = [dxpy.DXGTable.make_column_desc("word", "string")] # Call a subprocess and dump its output to a local file. # (Remove possessives and other bogus words from the word list) subprocess.check_call('egrep "^[a-z]+$" /usr/share/dict/american-english > words.txt', shell=True) # Parse the file we just generated into a GTable. with dxpy.new_dxgtable(columns=columns, mode='w') as output_gtable: for index, word in enumerate(open("words.txt")): output_gtable.add_row([word.strip()]) if index % 10000 == 0: print "Read word: " + word.strip() # Closing the GTable automatically commences at the conclusion of the "with" block. return {'words': dxpy.dxlink(output_gtable.get_id())}
def test_mappings_to_sam_conversion(self):
    """Round-trip a one-row Mappings table through dx-mappings-to-sam.

    Builds the full mappings schema (including two pass-through SAM tag
    columns), inserts one primary forward-strand read, and compares the
    converter's stdout against the expected SAM fixture.
    """
    mappings_table = dxpy.new_dxgtable([
        dxpy.DXGTable.make_column_desc("sequence", "string"),
        dxpy.DXGTable.make_column_desc("quality", "string"),
        dxpy.DXGTable.make_column_desc("name", "string"),
        dxpy.DXGTable.make_column_desc("status", "string"),
        dxpy.DXGTable.make_column_desc("chr", "string"),
        dxpy.DXGTable.make_column_desc("lo", "int32"),
        dxpy.DXGTable.make_column_desc("hi", "int32"),
        dxpy.DXGTable.make_column_desc("negative_strand", "boolean"),
        dxpy.DXGTable.make_column_desc("error_probability", "uint8"),
        dxpy.DXGTable.make_column_desc("qc_fail", "boolean"),
        dxpy.DXGTable.make_column_desc("duplicate", "boolean"),
        dxpy.DXGTable.make_column_desc("cigar", "string"),
        dxpy.DXGTable.make_column_desc("template_id", "int64"),
        dxpy.DXGTable.make_column_desc("read_group", "uint16"),
        dxpy.DXGTable.make_column_desc("sam_field_MD", "string"),
        dxpy.DXGTable.make_column_desc("sam_field_XN", "int32")
    ])
    # -2147483648 (INT32_MIN) in sam_field_XN is the "null" sentinel here.
    mappings_table.add_rows(data=[[
        "TAATAAGGTTGTTGTTGTTGTT", "1:1ADDDACFHA?HGFGIIE+<", "FOO.12345678",
        "PRIMARY", "1", 54932368, 54932390, False, 60, False, False,
        "7M1D93M", 289090731, 0, "1A5^A93", -2147483648
    ]], part=1)
    mappings_table.set_details({
        "read_groups": [
            {"num_singles": 1, "num_pairs": 0}
        ],
        "original_contigset": {"$dnanexus_link": self.genome_id}
    })
    # Block until fully closed so the converter can read the table.
    mappings_table.close(block=True)
    # FIX: assertEquals is a deprecated alias of assertEqual (removed in
    # Python 3.12); use the canonical name.
    self.assertEqual(run('dx-mappings-to-sam {g}'.format(g=mappings_table.get_id())),
                     self.expected_sam)
def import_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags):
    """Import a simple 3+-column BED file as a Spans GTable.

    Creates a chr/lo/hi table with a genomic-range index, attaches details
    (reference link, optional source-file link, user properties, any "track"
    line), and returns a dxlink to the resulting table.

    Raises dxpy.AppError on a malformed BED file or mismatched property
    key/value lists.
    """
    num_cols = find_num_columns(bed_file)
    if num_cols < 3:
        raise dxpy.AppError("BED file contains less than the minimum 3 columns. Invalid BED file.")

    columns = [("chr", "string"), ("lo", "int32"), ("hi", "int32")]
    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]
    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=[gri_index], mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        # FIX: use identity comparison with None (PEP 8) instead of "!= None".
        if file_id is not None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        # FIX: pair keys with values directly instead of indexing by position.
        for key, value in zip(property_keys, property_values):
            details[key] = value

        span.set_details(details)
        span.add_types(["Spans", "gri"])
        span.rename(table_name)

        for line in bed:
            if line.startswith("track"):
                # Preserve the UCSC "track" line in the table details.
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            line = line.split()
            if len(line) == 0:
                # NOTE(review): a blank line stops the import entirely —
                # presumably only trailing blank lines are expected; confirm.
                break
            if len(line) < 3:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in BED file contains less than the minimum 3 columns. Invalid BED file.")
            line[1] = int(line[1])
            line[2] = int(line[2])
            span.add_row(line)

    return dxpy.dxlink(span.get_id())
def test_add_rows_bad_data(self):
    """Local type validation rejects any row whose cells don't match the schema."""
    self.dxgtable = dxpy.new_dxgtable([
        dxpy.DXGTable.make_column_desc("a", "string"),
        dxpy.DXGTable.make_column_desc("b", "float"),
        dxpy.DXGTable.make_column_desc("c", "int32"),
        dxpy.DXGTable.make_column_desc("d", "boolean"),
        ])
    # One bad cell per row, in column order 0..3.
    bad_rows = [
        [303, 1.248, 123, True],      # column 0: int where string expected
        ["303", "1.248", 123, True],  # column 1: string where float expected
        ["303", 1.248, 123.5, True],  # column 2: float where int32 expected
        ["303", 1.248, 123, "True"],  # column 3: string where boolean expected
    ]
    for part, bad_row in enumerate(bad_rows, start=1):
        with self.assertRaises(ValueError):
            self.dxgtable.add_rows(data=[bad_row], part=part)
    # Correct column types are accepted.
    self.dxgtable.add_rows(data=[[u"303", 1.248, 123, True]], part=5)
    self.dxgtable.close(block=True)
def test_iter_table(self):
    """Check direct iteration and iterate_rows() windows over a 64-row table."""
    self.dxgtable = dxpy.new_dxgtable(
        [dxpy.DXGTable.make_column_desc("a", "string"),
         dxpy.DXGTable.make_column_desc("b", "int32")])
    for i in range(64):
        self.dxgtable.add_rows(data=[["row"+str(i), i]], part=i+1)
    self.dxgtable.close(block=True)

    # Full iteration: row[2] is column "b", which equals the row's index.
    counter = 0
    for row in self.dxgtable:
        self.assertEqual(row[2], counter)
        counter += 1
    self.assertEqual(counter, 64)

    # start=1 skips the first row (start is inclusive).
    counter = 0
    for row in self.dxgtable.iterate_rows(start=1):
        self.assertEqual(row[2], counter+1)
        counter += 1
    self.assertEqual(counter, 63)

    # end=2 yields rows 0 and 1 (end is exclusive).
    counter = 0
    for row in self.dxgtable.iterate_rows(end=2):
        self.assertEqual(row[2], counter)
        counter += 1
    self.assertEqual(counter, 2)

    # Both bounds: rows 1..62 inclusive, i.e. 62 rows.
    counter = 0
    for row in self.dxgtable.iterate_rows(start=1, end=63):
        self.assertEqual(row[2], counter+1)
        counter += 1
    self.assertEqual(counter, 62)

    # Column selection still honors the same start/end window.
    counter = 0
    for row in self.dxgtable.iterate_rows(columns=['a'], start=1, end=63):
        counter += 1
    self.assertEqual(counter, 62)
def main(**kwargs):
    """CLI entry point: import a delimited file (CSV/TSV or stdin) into a new GTable.

    Parses data-object arguments, resolves the output path, derives column
    specs either from --columns or from the file's first row, streams the rows
    into a new GTable, and prints its ID or full description.
    """
    # kwargs empty -> invoked as a script; otherwise args come from the caller.
    if len(kwargs) == 0:
        args = parser.parse_args(sys.argv[1:])
    else:
        args = parser.parse_args(kwargs)

    try:
        process_dataobject_args(args)
    except Exception as details:
        parser.exit(1, unicode(details) + "\n")

    try:
        process_single_dataobject_output_args(args)
    except Exception as details:
        parser.exit(1, unicode(details) + "\n")

    # Resolve destination project/folder/name; default name comes from the
    # input filename unless reading from stdin ("-").
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = get_env_var("DX_CLI_WD", u"/")
        if args.filename != "-":
            name = os.path.basename(args.filename)
        else:
            name = None
    else:
        project, folder, name = resolve_path(args.output)
        if name is None and args.filename != "-":
            name = os.path.basename(args.filename)

    # --indices is a JSON string; --gri adds a genomic-range index and the
    # "gri" type on top of whatever was given.
    args.indices = [] if args.indices is None else json.loads(args.indices)
    if args.gri is not None:
        args.indices.append(dxpy.DXGTable.genomic_range_index(args.gri[0], args.gri[1], args.gri[2]))
        args.types = ["gri"] if args.types is None else args.types + ["gri"]

    if args.filename == "-":
        fd = sys.stdin
    else:
        try:
            fd = open(args.filename, "rb")
        except:
            parser.exit(1, fill(unicode("Could not open " + args.filename + " for reading")) + "\n")

    # The first row is read separately: it is either a header of column specs
    # or (with --columns) the first data row.
    firstrow = fd.readline()

    if args.csv:
        delimiter = ","
        dialect = "excel"
    else:
        delimiter = "\t"
        dialect = "excel"
    # else:
    #     # Try to sniff the file format
    #     dialect = csv.Sniffer().sniff(firstrow)
    #     delimiter = dialect.delimiter

    firstrow_reader = csv.reader([firstrow], dialect=dialect, delimiter=delimiter)
    # NOTE(review): reader.next() is Python 2 only; Python 3 needs next(reader).
    firstrow_data = firstrow_reader.next()
    reader = csv.reader(fd, dialect=dialect, delimiter=delimiter)

    # Build column specs "name:type" -> {"name": ..., "type": ...}; the types
    # list holds the parse category used by parse_item for each column.
    column_specs = []
    types = []
    if args.columns is not None:
        specs = split_unescaped(",", args.columns)
    else:
        specs = firstrow_data
    for spec in specs:
        if ":" in spec:
            col_type = spec[spec.find(":") + 1:]
            column_specs.append({"name": spec[:spec.find(":")], "type": col_type})
            if "int" in col_type:
                types.append("int")
            elif col_type == "boolean":
                types.append("boolean")
            elif col_type in ["float", "double"]:
                types.append("float")
            elif col_type == "string":
                types.append("string")
            else:
                parser.exit(1, "Unrecognized column type: " + col_type + "\n")
        else:
            # Untyped spec defaults to string.
            column_specs.append({"name": spec, "type": "string"})
            types.append("string")

    try:
        dxgtable = dxpy.new_dxgtable(project=project, name=name, tags=args.tags,
                                     types=args.types, hidden=args.hidden,
                                     properties=args.properties, details=args.details,
                                     folder=folder, parents=args.parents,
                                     columns=column_specs, indices=args.indices)
        # With explicit --columns the first row was data, not a header.
        if args.columns is not None:
            dxgtable.add_row([parse_item(firstrow_data[i], types[i]) for i in range(len(types))])
        for row in reader:
            dxgtable.add_row([parse_item(row[i], types[i]) for i in range(len(types))])
        dxgtable.close(block=args.wait)
        if args.brief:
            print(dxgtable.get_id())
        else:
            print_desc(dxgtable.describe(incl_properties=True, incl_details=True))
    except Exception as details:
        parser.exit(1, fill(unicode(details)) + "\n")
def import_reads(job_input):
    """Import FASTQ/FASTA (optionally paired, with separate quality files) into a Reads GTable.

    job_input of None means CLI invocation: merge parsed argparse values into
    the module-level ``args`` dict; otherwise ``args`` is replaced wholesale.
    Returns {'reads': dxlink} for the closed table.
    """
    global args
    if job_input == None:
        temp = vars(parser.parse_args(sys.argv[1:]))
        for key in temp:
            if temp[key] != None:
                if key == 'tags':
                    args[key] = temp[key].split(",")
                    # remove whitespace around tags
                    for i in range(len(args[key])):
                        args[key][i] = args[key][i].rstrip().lstrip()
                elif key == 'properties':
                    try:
                        args[key] = ast.literal_eval(temp[key])
                    except SyntaxError:
                        raise dxpy.AppError("Cannot parse properties: " + temp[key])
                else:
                    args[key] = temp[key]
    else:
        args = job_input

    print(args)

    # Presence of a second reads file marks the input as paired-end.
    if 'file2' in args:
        paired = True
    else:
        paired = False

    is_fasta, is_colorspace, qual_encoding = sniff_fastq(args["file"])

    # FASTQ already embeds qualities, so a separate quality file is an error.
    if is_fasta == False and ('qual' in args or 'qual2' in args):
        raise dxpy.AppError("Qualities supplied twice: FASTQ format file found along with separate quality file.")

    if is_fasta and 'qual' not in args:
        reads_have_qualities = False
    else:
        reads_have_qualities = True

    # Schema is assembled dynamically from the paired/name/quality options.
    table_columns = []
    if not args['discard_names']:
        table_columns.append(("name", "string"))
        if paired:
            table_columns.append(("name2", "string"))
    table_columns.append(("sequence", "string"))
    if paired:
        table_columns.append(("sequence2", "string"))
    if reads_have_qualities and not args['discard_qualities']:
        table_columns.append(("quality", "string"))
        if paired:
            table_columns.append(("quality2", "string"))

    column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in table_columns]
    logging.info("Constructed table schema: %s" % column_descriptors)

    readsTable = dxpy.new_dxgtable(column_descriptors)

    if is_colorspace:
        readsTable.add_types(['ColorReads', 'Reads'])
        details = readsTable.get_details()
        details['sequence_type'] = "color"
        readsTable.set_details(details)
    else:
        readsTable.add_types(['LetterReads', 'Reads'])

    if 'tags' in args:
        readsTable.add_tags(args['tags'])
    if 'properties' in args:
        readsTable.set_properties(args['properties'])

    # Record pairing metadata (orientation/distances) when supplied.
    if paired:
        details = readsTable.get_details()
        details['paired'] = True
        # TODO implement estimate paired read distance
        # otherwise take the values they give
        if 'pair_orientation' in args:
            details['pair_orientation'] = args['pair_orientation']
        if 'pair_min_dist' in args:
            details['pair_min_dist'] = args['pair_min_dist']
        if 'pair_max_dist' in args:
            details['pair_max_dist'] = args['pair_max_dist']
        if 'pair_avg_dist' in args:
            details['pair_avg_dist'] = args['pair_avg_dist']
        if 'pair_std_dev_dist' in args:
            details['pair_std_dev_dist'] = args['pair_std_dev_dist']
        readsTable.set_details(details)

    # generate translation table for enforcing string syntax:
    # maps each of "." and "-" to "N" in one translate() pass.
    to_replace = ''.join([".", "-"])
    N = ''.join(['N'] * len(to_replace))
    transtable = string.maketrans(to_replace, N)

    for name1, seq1, qual1, name2, seq2, qual2 in iterate_reads(
            fastqa1_filename=args["file"],
            fastqa2_filename=args["file2"] if 'file2' in args else None,
            qual1_filename=args["qual"] if 'qual' in args else None,
            qual2_filename=args["qual2"] if 'qual2' in args else None,
            is_fasta=is_fasta,
            is_colorspace=is_colorspace,
            qual_encoding=qual_encoding):
        row = []

        # add name (strip the FASTA ">" / FASTQ "@" record marker)
        if args['discard_names'] == False:
            if is_fasta and name1[0] == '>':
                name1 = name1[1:]
            elif name1[0] == '@':
                name1 = name1[1:]
            row.append(name1)
            if paired:
                if is_fasta and name2[0] == '>':
                    name2 = name2[1:]
                elif name2[0] == '@':
                    name2 = name2[1:]
                row.append(name2)

        # enforce UPPERCASE
        seq1 = seq1.upper()
        if paired:
            seq2 = seq2.upper()

        # translate bad chars into Ns (letter space only)
        if not is_colorspace:
            seq1 = seq1.translate(transtable)
            if paired:
                seq2 = seq2.translate(transtable)

        # add seq
        row.append(seq1)
        if paired:
            row.append(seq2)

        # add quals
        if reads_have_qualities and not args['discard_qualities']:
            row.append(qual1)
            if paired:
                row.append(qual2)

        readsTable.add_row(row)

    # print out table ID
    print(json.dumps({'table_id': readsTable.get_id()}))

    if 'name' in args:
        tableName = args['name']
    else:
        tableName = remove_file_type(args['file']) + " reads"
    readsTable.rename(tableName)

    # set link to original FASTQ file object(s)
    details = readsTable.get_details()
    if 'file_link' in args:
        details['original_files'] = [args['file_link']]
        if 'file2' in args:
            details['original_files'].append(args['file2_link'])
        if 'qual' in args:
            details['original_files'].append(args['qual_link'])
            if 'file2' in args:
                assert('qual2' in args)
                details['original_files'].append(args['qual2_link'])
    readsTable.set_details(details)
    readsTable.close()

    # place table in output
    return {'reads': dxpy.dxlink(readsTable.get_id())}
def main(**job_inputs):
    """Orchestrate variant calling over a Mappings table via map-reduce child jobs.

    Builds an open Variants GTable, splits the genome into ~reads_per_job-sized
    regions, launches one "map" job per region (each appends rows to the shared
    table), and a final "reduce" job that closes the table once all maps finish.
    """
    job_outputs = {}
    mappingsTable = dxpy.open_dxgtable(job_inputs["mappings"]["$dnanexus_link"])
    mappingsTableId = mappingsTable.get_id()

    # This controls the degree of parallelism
    chunks = int(mappingsTable.describe()["length"] / job_inputs["reads_per_job"]) + 1

    try:
        contigSetId = mappingsTable.get_details()["original_contigset"]["$dnanexus_link"]
        originalContigSet = mappingsTable.get_details()["original_contigset"]
    except:
        raise Exception("The original reference genome must be attached as a detail")

    # In the next major section of code, we construct a variants table. As
    # regions of the genome are passed to each worker and variants are called
    # on them, the workers will add rows to this table concurrently.
    variants_schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "ref", "type": "string"},
        {"name": "alt", "type": "string"},
        {"name": "qual", "type": "double"},
        {"name": "ids", "type": "string"},
    ]

    # The information in these tags is elevated into specific columns, so
    # additional columns for these tags will not be created
    elevatedTags = ["format_GT", "format_DP", "format_AD"]

    # The info and format tags are extracted from the header printed by samtools
    # If additional code will add a tag to the output of the program, modify
    # this header to include the tag.
    # TODO: Allow the table to be created by the first job that finishes to avoid this step.
    headerInfo = extractHeader("/tmp/header.txt", elevatedTags)
    description = {}
    samples = []

    # NOTE(review): this indices list (and the per-sample lexicographic index
    # appended below, and the description dict) are built but never passed to
    # new_dxgtable — consistent with the TODO about secondary indices; confirm.
    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")]

    ##The following section creates the sample-specific table columns
    for k, v in headerInfo["tags"]["info"].iteritems():
        variants_schema.append({"name": "info_" + k, "type": translateTagTypeToColumnType(v)})
        description[k] = {"name": k, "description": v["description"], "type": v["type"], "number": v["number"]}

    # For each sample, add the sample-specific columns to the schema, at
    # present only one sample is supported
    numSamples = 1
    for i in range(numSamples):
        variants_schema.extend(
            [
                {"name": "genotype_" + str(i), "type": "string"},
                {"name": "phasing_" + str(i), "type": "string"},
                {"name": "type_" + str(i), "type": "string"},
                {"name": "variation_qual_" + str(i), "type": "double"},
                {"name": "genotype_qual_" + str(i), "type": "double"},
                {"name": "coverage_" + str(i), "type": "string"},
                {"name": "total_coverage_" + str(i), "type": "int32"},
            ]
        )
        indices.append(dxpy.DXGTable.lexicographic_index([["type_" + str(i), "ASC"]], "type_" + str(i)))
        samples.append("Sample_0")
        for k, v in headerInfo["tags"]["format"].iteritems():
            if "format_" + k not in elevatedTags:
                variants_schema.append({"name": "format_" + k + "_" + str(i), "type": translateTagTypeToColumnType(v)})

    # TODO: Add lexicographic indices when secondary indices are supported
    variants = dxpy.new_dxgtable(variants_schema,
                                 indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    tableId = variants.get_id()
    variants = dxpy.open_dxgtable(tableId)
    variants.add_types(["Variants", "gri"])

    details = {
        "samples": samples,
        "original_contigset": job_inputs["reference"],
        "original_mappings": job_inputs["mappings"],
        "formats": headerInfo["tags"]["format"],
        "infos": headerInfo["tags"]["info"],
    }
    # if headerInfo.get('filters') != {}:
    #     details['filters'] = headerInfo['filters']
    variants.set_details(details)

    if "output_name" in job_inputs:
        variants.rename(job_inputs["output_name"])
    else:
        variants.rename(mappingsTable.describe()["name"] + " variant calls by Samtools mpileup")

    # Split the genome into evenly sized regions
    genomeRegions = splitGenomeLengthLargePieces(originalContigSet, chunks)

    # Generate the command line arguments needed to run samtools and bcftools
    samOptions = makeSamtoolsParameters(**job_inputs)
    bcfOptions = makeBcftoolsParameters(**job_inputs)

    # The rest of the main function contains the map-reduce functionality.
    # For each genome chunk, an input spec is created for a new child job.
    reduce_job_inputs = {}
    for i in range(len(genomeRegions)):
        if len(genomeRegions[i]) > 0:
            map_job_inputs = {
                "mappings_table_id": mappingsTableId,
                "original_contig_set": contigSetId,
                "interval": genomeRegions[i],
                "tableId": tableId,
                "compress_reference": job_inputs["compress_reference"],
                "compress_no_call": job_inputs["compress_no_call"],
                "infer_no_call": job_inputs["infer_no_call"],
                "sam_options": samOptions,
                "bcf_options": bcfOptions,
                "part_number": i,
            }
            # Run a "map" job for each chunk, passing in the inputspec from
            # above and looking for a function entry point given as "map"
            # (@dxpy.entry_point('map'))
            map_job = dxpy.new_dxjob(map_job_inputs, "map")
            reduce_job_inputs["mapJob" + str(i) + "TableId"] = {"job": map_job.get_id(), "field": "ok"}

    reduce_job_inputs["tableId"] = tableId

    # Run a "reduce" job, which only begins once all of the map jobs signal
    # they have completed by sending 'ok':True.  The reduce job closes the
    # table.  This step is explicitly needed because table closing must wait
    # till the completion of the map jobs; by giving the reduce job the map
    # jobs as input, the reduce job will wait to start.
    reduce_job = dxpy.new_dxjob(reduce_job_inputs, "reduce")
    job_outputs = {"variants": {"job": reduce_job.get_id(), "field": "variants"}}
    return job_outputs
def import_reads(job_input):
    """Import FASTQ/FASTA (optionally paired, with separate quality files) into a Reads GTable.

    job_input of None means CLI invocation: merge parsed argparse values into
    the module-level ``args`` dict; otherwise ``args`` is replaced wholesale.
    Returns {'reads': dxlink} for the closed table.
    """
    global args
    if job_input == None:
        temp = vars(parser.parse_args(sys.argv[1:]))
        for key in temp:
            if temp[key] != None:
                if key == 'tags':
                    args[key] = temp[key].split(",")
                    # remove whitespace around tags
                    for i in range(len(args[key])):
                        args[key][i] = args[key][i].rstrip().lstrip()
                elif key == 'properties':
                    try:
                        args[key] = ast.literal_eval(temp[key])
                    except SyntaxError:
                        raise dxpy.AppError("Cannot parse properties: " + temp[key])
                else:
                    args[key] = temp[key]
    else:
        args = job_input

    print(args)

    # Presence of a second reads file marks the input as paired-end.
    if 'file2' in args:
        paired = True
    else:
        paired = False

    is_fasta, is_colorspace, qual_encoding = sniff_fastq(args["file"])

    # FASTQ already embeds qualities, so a separate quality file is an error.
    if is_fasta == False and ('qual' in args or 'qual2' in args):
        raise dxpy.AppError("Qualities supplied twice: FASTQ format file found along with separate quality file.")

    if is_fasta and 'qual' not in args:
        reads_have_qualities = False
    else:
        reads_have_qualities = True

    # Schema is assembled dynamically from the paired/name/quality options.
    table_columns = []
    if not args['discard_names']:
        table_columns.append(("name", "string"))
        if paired:
            table_columns.append(("name2", "string"))
    table_columns.append(("sequence", "string"))
    if paired:
        table_columns.append(("sequence2", "string"))
    if reads_have_qualities and not args['discard_qualities']:
        table_columns.append(("quality", "string"))
        if paired:
            table_columns.append(("quality2", "string"))

    column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in table_columns]
    logging.info("Constructed table schema: %s" % column_descriptors)

    readsTable = dxpy.new_dxgtable(column_descriptors)

    if is_colorspace:
        readsTable.add_types(['ColorReads', 'Reads'])
        details = readsTable.get_details()
        details['sequence_type'] = "color"
        readsTable.set_details(details)
    else:
        readsTable.add_types(['LetterReads', 'Reads'])

    if 'tags' in args:
        readsTable.add_tags(args['tags'])
    if 'properties' in args:
        readsTable.set_properties(args['properties'])

    # Record pairing metadata (orientation/distances) when supplied.
    if paired:
        details = readsTable.get_details()
        details['paired'] = True
        # TODO implement estimate paired read distance
        # otherwise take the values they give
        if 'pair_orientation' in args:
            details['pair_orientation'] = args['pair_orientation']
        if 'pair_min_dist' in args:
            details['pair_min_dist'] = args['pair_min_dist']
        if 'pair_max_dist' in args:
            details['pair_max_dist'] = args['pair_max_dist']
        if 'pair_avg_dist' in args:
            details['pair_avg_dist'] = args['pair_avg_dist']
        if 'pair_std_dev_dist' in args:
            details['pair_std_dev_dist'] = args['pair_std_dev_dist']
        readsTable.set_details(details)

    # generate translation table for enforcing string syntax:
    # maps each of "." and "-" to "N" in one translate() pass.
    to_replace = ''.join([".", "-"])
    N = ''.join(['N'] * len(to_replace))
    transtable = string.maketrans(to_replace, N)

    for name1, seq1, qual1, name2, seq2, qual2 in iterate_reads(
            fastqa1_filename=args["file"],
            fastqa2_filename=args["file2"] if 'file2' in args else None,
            qual1_filename=args["qual"] if 'qual' in args else None,
            qual2_filename=args["qual2"] if 'qual2' in args else None,
            is_fasta=is_fasta,
            is_colorspace=is_colorspace,
            qual_encoding=qual_encoding):
        row = []

        # add name (strip the FASTA ">" / FASTQ "@" record marker)
        if args['discard_names'] == False:
            if is_fasta and name1[0] == '>':
                name1 = name1[1:]
            elif name1[0] == '@':
                name1 = name1[1:]
            row.append(name1)
            if paired:
                if is_fasta and name2[0] == '>':
                    name2 = name2[1:]
                elif name2[0] == '@':
                    name2 = name2[1:]
                row.append(name2)

        # enforce UPPERCASE
        seq1 = seq1.upper()
        if paired:
            seq2 = seq2.upper()

        # translate bad chars into Ns (letter space only)
        if not is_colorspace:
            seq1 = seq1.translate(transtable)
            if paired:
                seq2 = seq2.translate(transtable)

        # add seq
        row.append(seq1)
        if paired:
            row.append(seq2)

        # add quals
        if reads_have_qualities and not args['discard_qualities']:
            row.append(qual1)
            if paired:
                row.append(qual2)

        readsTable.add_row(row)

    # print out table ID
    print(json.dumps({'table_id': readsTable.get_id()}))

    if 'name' in args:
        tableName = args['name']
    else:
        tableName = remove_file_type(args['file']) + " reads"
    readsTable.rename(tableName)

    # set link to original FASTQ file object(s)
    details = readsTable.get_details()
    if 'file_link' in args:
        details['original_files'] = [ args['file_link'] ]
        if 'file2' in args:
            details['original_files'].append(args['file2_link'])
        if 'qual' in args:
            details['original_files'].append(args['qual_link'])
            if 'file2' in args:
                assert('qual2' in args)
                details['original_files'].append(args['qual2_link'])
    readsTable.set_details(details)
    readsTable.close()

    # place table in output
    return {'reads': dxpy.dxlink(readsTable.get_id())}
def main(**job_inputs):
    """Entry point for the VCF importer app (Python 2).

    Downloads the input VCF, builds a Variants GTable schema from the VCF
    header (one set of per-sample columns per sample, max 10 samples), then
    shells out to ``dx_vcfToVariants2`` to populate the table.

    Returns a dict with key 'variants' holding a dxlink to the new table.
    Raises dxpy.AppError on too many samples or on importer failure.
    """
    job_outputs = {}
    header = ''  # NOTE(review): unused local — kept for byte-compatibility
    print "Downloading input VCF file"
    inputFile = dxpy.download_dxfile(job_inputs['vcf'], 'output.file')
    # decompressFile is expected to leave the plain-text VCF at 'output.vcf'
    # — presumably handles gzip/bzip2 inputs; defined elsewhere in this file.
    decompressFile('output.file')
    print "Constructing table schema"
    # These FORMAT tags get dedicated ("elevated") columns rather than
    # generic format_* columns.
    elevatedTags = ['format_GT', 'format_DP', 'format_AD']
    headerInfo = extractHeader('output.vcf', elevatedTags)
    # Fixed leading columns common to all Variants tables.
    variants_schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "ref", "type": "string"},
        {"name": "alt", "type": "string"},
        {"name": "qual", "type": "double"},
        {"name": "ids", "type": "string"}
    ]
    description = {}
    samples = []
    # Only add a 'filter' column if the header actually declared filters.
    if headerInfo.get('filters') != {}:
        variants_schema.append({"name": "filter", "type": "string"})
    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri')]
    # NOTE(review): the following three locals are never used afterwards.
    formats = {}
    infos = {}
    filters = {}
    # One generic column per INFO tag declared in the header.
    for k, v in headerInfo['tags']['info'].iteritems():
        variants_schema.append({"name": "info_"+k, "type":translateTagTypeToColumnType(v)})
        description[k] = {'name' : k, 'description' : v['description'], 'type' : v['type'], 'number' : v['number']}
    # Sample names start at column 10 of the #CHROM header line.
    numSamples = len(headerInfo['columns'].strip().split("\t")[9:])
    if numSamples > 10:
        raise dxpy.AppError("The VCF file contained too many samples, can't import a VCF containing more than 10 samples")
    if job_inputs['searchable_ids']:
        # Lexicographic index so variant IDs can be searched by name.
        indices.append(dxpy.DXGTable.lexicographic_index([
            dxpy.DXGTable.lexicographic_index_column("ids", True, False),
            dxpy.DXGTable.lexicographic_index_column("chr"),
            dxpy.DXGTable.lexicographic_index_column("lo"),
            dxpy.DXGTable.lexicographic_index_column("hi")], "search"))
    #For each sample, write the sample-specific columns
    for i in range(len(headerInfo['columns'].strip().split("\t")[9:])):
        #This prevents name collision in columns
        variants_schema.extend([
            {"name": "genotype_"+str(i), "type": "string"},
            {"name": "phasing_"+str(i), "type": "string"},
            {"name": "type_"+str(i), "type": "string"},
            {"name": "variation_qual_"+str(i), "type": "double"},
            {"name": "genotype_qual_"+str(i), "type": "double"},
            {"name": "coverage_"+str(i), "type": "string"},
            {"name": "total_coverage_"+str(i), "type": "int32"}
        ])
        #indices.append(dxpy.DXGTable.lexicographic_index([["type_"+str(i), "ASC"]], 'type_'+str(i)))
        samples.append(headerInfo['columns'].strip().split("\t")[9:][i])
        # Per-sample columns for every non-elevated FORMAT tag.
        for k, v in headerInfo['tags']['format'].iteritems():
            if "format_"+k not in elevatedTags:
                variants_schema.append({"name": "format_"+k+"_"+str(i), "type":translateTagTypeToColumnType(v)})
    # Output table name: explicit input, or the VCF filename minus its
    # final extension.
    if 'output_name' in job_inputs:
        name = job_inputs['output_name']
    else:
        fileName = dxpy.DXFile(job_inputs['vcf']['$dnanexus_link']).describe()['name']
        name = fileName.split(".")[0]
        for x in fileName.split(".")[1:-1]:
            name += "."+x
    details = {'samples':samples, 'original_contigset':job_inputs['reference'], 'original_file':job_inputs['vcf'], 'formats':headerInfo['tags']['format'], 'infos':headerInfo['tags']['info'], 'alts':headerInfo['tags']['alt']}
    if headerInfo.get('filters') != {}:
        details['filters'] = headerInfo['filters']
    table = dxpy.new_dxgtable(variants_schema, indices=indices)
    table.set_details(details)
    types = ["Variants", "gri"]
    if 'additional_types' in job_inputs:
        for x in job_inputs['additional_types'].split(","):
            if x != '':
                types.append(x)
    table.add_types(types)
    if 'tags' in job_inputs:
        table.add_tags(job_inputs['tags'])
    if 'properties' in job_inputs:
        table.set_properties(job_inputs['properties'])
    table.rename(name)
    # Build the command line for the external row-import tool.
    command = "dx_vcfToVariants2"
    command += " --table_id " + str(table.get_id())
    command += " --vcf_file output.vcf"
    if job_inputs['compress_reference']:
        command += " --compress_reference"
    if job_inputs['infer_no_call']:
        command += " --infer_no_call"
    if job_inputs['compress_no_call']:
        command += " --compress_no_call"
    command += " --encoding "+job_inputs["file_encoding"]
    print "Importing variants by running:", command
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        # The importer writes a human-readable error to AppError.txt on
        # failure; surface it if present, otherwise report a generic error.
        try:
            errorData = open("AppError.txt", 'r').read()
            raise dxpy.AppError(errorData)
        except IOError:
            raise dxpy.AppError("An unknown error occurred. Please check the log file")
    attach_empty_trackspec(table)
    table.close()
    result = dxpy.dxlink(table.get_id())
    job_outputs['variants'] = result
    return job_outputs
def main(BAM, reference, mb_per_chunk):
    """Split a BAM file into size-based chunks and fan them out to
    parallel 'process' subjobs that fill a shared Mappings GTable,
    followed by a 'postprocess' job that depends on all of them.

    Returns {'mappings': dxlink-to-the-new-table}.
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    #BAM = dxpy.DXFile(BAM)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    #dxpy.download_dxfile(BAM.get_id(), "input.bam")

    # Split your work into parallel tasks.  As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.
    # Chunk count is derived from the BAM size (mb_per_chunk MB each),
    # with a floor of one chunk.
    chunks = int(dxpy.DXFile(BAM).describe()['size']/(1000000*mb_per_chunk))
    if chunks < 1:
        chunks = 1
    #subprocess.check_call("samtools view input.bam -H -o header.txt", shell=True)
    #chromosomes = re.findall("SN:([^\t]*)", line.strip())
    schema = [
            {"name": "sequence", "type":"string"},
            {"name": "chr", "type": "string"},
            {"name": "lo", "type": "int32"},
            {"name": "hi", "type": "int32"},
            {"name": "negative_strand", "type": "boolean"},
            {"name": "cigar", "type": "string"}
            ]
    mappingsTable = dxpy.new_dxgtable(schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    mappingsTable.add_types(["Mappings"])
    mappingsTable.set_details({"original_contigset":dxpy.dxlink(reference)})
    subjobs = []
    for i in range(chunks):
        #subprocess.check_call("samtools view -b input.bam -F 4 -o subset.bam %s" % (" ".join(chromosomes[i::chunks])), shell=True)
        #jobFile = dxpy.upload_local_file("subset.bam").get_id()
        # Each subjob receives the shared table ID plus its chunk index.
        subjob_input = { "tableId": mappingsTable.get_id(), "BAM": BAM, "job_id": i, "chunks": chunks}
        subjobs.append(dxpy.new_dxjob(subjob_input, 'process'))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  If you give it any inputs that
    # use outputs from the "process" jobs, then it will automatically
    # wait for those jobs to finish before it starts running.  If you
    # do not need to give it any such inputs, you can explicitly state
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either DXJob objects or string job IDs in the list).
    postprocess_job = dxpy.new_dxjob(fn_input={ "process_outputs": [subjob.get_output_ref("output") for subjob in subjobs] },
                                     fn_name='postprocess',
                                     depends_on=subjobs)

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a reference.  If the output field in the postprocess
    # function is called "answer", you can pass that on here as
    # follows:
    #
    # return { "app_output_field": postprocess_job.get_output_ref("answer"), ...}
    output = {'mappings': dxpy.dxlink(mappingsTable.get_id())}
    return output
def import_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags, isBedDetail, delimiter="\t"):
    """Import a BED (or bedDetail) file into a new Spans GTable.

    Builds a column schema from the number of columns found in the file
    (standard BED columns first, generic string columns for extras, plus
    two trailing bedDetail columns when isBedDetail), then streams rows
    into the table.  Returns a dxlink to the new table.

    Raises dxpy.AppError on mismatched property lists or rows with fewer
    than 3 columns.
    NOTE(review): additional_types and tags parameters are accepted but
    never used in this body — confirm against callers.
    """
    num_cols = find_num_columns(bed_file, delimiter)

    # if this is a bedDetail file we should treat the last two columns separately
    if isBedDetail:
        num_cols -= 2

    possible_columns = [("chr", "string"), ("lo", "int32"), ("hi", "int32"), ("name", "string"), ("score", "float"), ("strand", "string"), ("thick_start", "int32"), ("thick_end", "int32"), ("item_rgb", "string")]
    bedDetail_columns = [("bedDetail_ID", "string"), ("bedDetail_desc", "string")]
    possible_default_row = ["", 0, 0, "", 0, ".", 0, 0, ""]
    columns = possible_columns[:num_cols]
    if isBedDetail:
        columns.extend(bedDetail_columns)
    # Past the 9 standard BED columns, any extras become generic strings.
    if num_cols > len(columns):
        for i in range(len(columns), num_cols):
            columns.append(("BED_column_"+str(i+1), "string"))
            possible_default_row.append("")
    default_row = possible_default_row[:num_cols]
    if isBedDetail:
        default_row.extend(["",""])
    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]
    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri')]
    # Add a name-search index only if the schema actually has a name column.
    for c in columns:
        if "name" in c:
            indices.append(dxpy.DXGTable.lexicographic_index([
                dxpy.DXGTable.lexicographic_index_column("name", True, False),
                dxpy.DXGTable.lexicographic_index_column("chr"),
                dxpy.DXGTable.lexicographic_index_column("lo"),
                dxpy.DXGTable.lexicographic_index_column("hi")], "search"))
            break
    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]
        span.set_details(details)
        span.add_types(["Spans", "gri"])
        span.rename(table_name)
        for line in bed:
            row = list(default_row)

            # UCSC "track" lines are stored in the details, not as rows.
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            line = line.split(delimiter)
            if isBedDetail:
                # only the first 4 columns are guaranteed to be defined by UCSC
                validate_line(line[:4])
                # save last two fields separately
                bedDetailFields = line[-2:]
                line = line[:-2]
            else:
                validate_line(line[:num_cols])

            # check to see if this is a weird line
            if len(line) == 0:
                break
            if len(line) < 3:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in BED file contains less than the minimum 3 columns.  Invalid BED file.")

            try:
                row[0] = line[0]
                row[1] = int(line[1])
                row[2] = int(line[2])
                row[3] = line[3]
                # dashes are sometimes used when field is invalid
                if line[4] == "-" or line[4] == ".":
                    line[4] = 0
                row[4] = float(line[4])
                row[5] = line[5]
                # dashes are sometimes used when field is invalid
                if line[6] == "-" or line[6] == ".":
                    line[6] = 0
                row[6] = int(line[6])
                # dashes are sometimes used when field is invalid
                if line[7] == "-" or line[7] == ".":
                    line[7] = 0
                row[7] = int(line[7])
                row[8] = line[8]
            # an index error would come from having fewer columns in a row, which we should handle ok
            except IndexError:
                pass
            # value error when fields are messed up and string gets converted to int, etc.  Throw these out.
            except ValueError:
                continue
            if isBedDetail:
                # add these in at the end if we have a bedDetail file
                row[num_cols] = bedDetailFields[0]
                row[num_cols+1] = bedDetailFields[1]

            span.add_row(row)

        span.flush()

    return dxpy.dxlink(span.get_id())
# NOTE(review): this is a dx-app-wizard CODE TEMPLATE, not valid Python as
# written — the DX_APP_WIZARD_* tokens (including the `||` placeholders) are
# substituted by the app wizard at generation time.  Do not expect this
# block to compile; edit it only in ways that preserve the placeholders.
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
DX_APP_WIZARD_INITIALIZE_INPUTDX_APP_WIZARD_DOWNLOAD_ANY_FILES
    # First, create the output GTable that will contain your results.
    # NOTE: You must specify the columns and indices for a GTable when
    # you create it, and they are immutable thereafter.
    #
    # Note: If you are filtering a GTable or are otherwise happy with
    # using the same exact columns and indices as your input GTable,
    # you can easily initialize your new GTable as follows:
    #
    # DX_APP_WIZARD_||_OUTPUT = dxpy.new_dxgtable(init_from=DX_APP_WIZARD_||_INPUT)
    #
    # In the more general case, you may want to specify different
    # columns.  The following lines assume you would like to create a
    # GTable with a genomic range index, i.e. there is a string column
    # for chromosome names and two integer columns for low and high
    # coordinates.

    columns = [dxpy.DXGTable.make_column_desc("chr", "string"),
               dxpy.DXGTable.make_column_desc("lo", "int"),
               dxpy.DXGTable.make_column_desc("hi", "int"),
               dxpy.DXGTable.make_column_desc("somedata", "string")]
    DX_APP_WIZARD_||_OUTPUT = dxpy.new_dxgtable(columns=columns,
                                                indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")])

    # Split your input to be solved by the next stage of your app.
    # The following assumes you are splitting the input by giving
    # 100000 rows of a GenomicTable per subjob running the
    # "process" entry point.

    num_rows = DX_APP_WIZARD_||_INPUT.describe()["length"]

    subjobs = []
    # NOTE(review): `num_rows / row_chunk_size` relies on Python 2 integer
    # division (ceil-division via the remainder test); row_chunk_size is
    # presumably injected by the wizard — confirm at generation time.
    for i in range(num_rows / row_chunk_size + (0 if num_rows % row_chunk_size == 0 else 1)):
        subjob_input = { "input_gtable_id": DX_APP_WIZARD_||_INPUT.get_id(),
                         "start_row": row_chunk_size * i,
                         "end_row": min(row_chunk_size * (i + 1), num_rows),
                         "output_gtable_id": DX_APP_WIZARD_||_OUTPUT.get_id()}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The next line creates the job that will perform the
    # "postprocess" step of your app.  It assumes that you do not need
    # to aggregate any output from your "process" stages (other than
    # closing the output GTable), but you can add the output of those
    # stages to the input of your "postprocess" stage easily by adding
    # the following value as a field in the "fn_input" dict and adding
    # the parameter to your "postprocess" entry point.
    #
    # fn_input={"process_outputs": [subjob.get_output_ref("output") for subjob in subjobs], ...}
    #
    # With no other input other than the output GTable ID for the
    # "postprocess" stage, we will force it to run only after all the
    # "process" stages have finished running by providing the list of
    # their DXJob handlers to the "depends_on" field (it accepts
    # either dxpy handlers or string IDs in the list).

    postprocess_job = dxpy.new_dxjob(fn_input={ "output_gtable_id": DX_APP_WIZARD_||_OUTPUT.get_id() },
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field is
    # called "answer", you can pass that on here as follows:
    #
    # return {"app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as GTables) which are closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    output = {}
def main(**kwargs):
    """CLI entry point (Python 2): import a CSV/TSV file (or stdin) into a
    new GTable.

    Column names/types come from --columns when given, otherwise from the
    first row of the file.  Exits via parser.exit(1, ...) on any error.
    """
    # With no kwargs this behaves as a command-line tool; kwargs allow
    # programmatic invocation with pre-built arguments.
    if len(kwargs) == 0:
        args = parser.parse_args(sys.argv[1:])
    else:
        args = parser.parse_args(kwargs)

    try:
        process_dataobject_args(args)
    except BaseException as details:
        parser.exit(1, unicode(details) + '\n')

    # Resolve destination project/folder/name from --output, defaulting to
    # the current workspace and working directory.
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = os.environ.get('DX_CLI_WD', '/')
        if args.filename != '-':
            name = os.path.basename(args.filename)
        else:
            name = None
    else:
        project, folder, name = resolve_path(args.output)
        if name is None and args.filename != '-':
            name = os.path.basename(args.filename)

    args.indices = [] if args.indices is None else json.loads(args.indices)
    if args.gri is not None:
        # --gri implies a genomic range index plus the 'gri' type.
        args.indices.append(dxpy.DXGTable.genomic_range_index(args.gri[0], args.gri[1], args.gri[2]))
        args.types = ['gri'] if args.types is None else args.types + ['gri']

    if args.filename == '-':
        fd = sys.stdin
    else:
        try:
            fd = open(args.filename, 'rb')
        except:
            parser.exit(1, fill(unicode('Could not open ' + args.filename + ' for reading')) + '\n')

    # The first row is read separately: it is either a header (column
    # specs) or, with --columns, the first data row.
    firstrow = fd.readline()

    if args.csv:
        delimiter = ','
        dialect = 'excel'
    else:
        delimiter = '\t'
        dialect = 'excel'
    # else:
    #     # Try to sniff the file format
    #     dialect = csv.Sniffer().sniff(firstrow)
    #     delimiter = dialect.delimiter

    firstrow_reader = csv.reader([firstrow], dialect=dialect, delimiter=delimiter)
    firstrow_data = firstrow_reader.next()
    reader = csv.reader(fd, dialect=dialect, delimiter=delimiter)

    column_specs = []
    types = []
    if args.columns is not None:
        specs = split_unescaped(',', args.columns)
    else:
        specs = firstrow_data
    # Each spec is "name" (string column) or "name:type".
    for spec in specs:
        if ':' in spec:
            col_type = spec[spec.find(':') + 1:]
            column_specs.append({'name': spec[:spec.find(':')],
                                 'type': col_type})
            if 'int' in col_type:
                types.append('int')
            elif col_type == 'boolean':
                types.append('boolean')
            elif col_type in ['float', 'double']:
                types.append('float')
            elif col_type == 'string':
                types.append('string')
            else:
                parser.exit(1, 'Unrecognized column type: ' + col_type + '\n')
        else:
            column_specs.append({'name': spec, 'type': 'string'})
            types.append('string')
    try:
        dxgtable = dxpy.new_dxgtable(project=project, name=name,
                                     tags=args.tags, types=args.types,
                                     hidden=args.hidden, properties=args.properties,
                                     details=args.details, folder=folder,
                                     parents=args.parents, columns=column_specs,
                                     indices=args.indices)
        # With --columns the first line was data, not a header: import it.
        if args.columns is not None:
            dxgtable.add_row([
                parse_item(firstrow_data[i], types[i]) for i in range(len(types))])
        for row in reader:
            dxgtable.add_row([
                parse_item(row[i], types[i]) for i in range(len(types))])
        dxgtable.close(block=args.wait)
        if args.brief:
            print dxgtable.get_id()
        else:
            print_desc(dxgtable.describe(incl_properties=True, incl_details=True))
    except BaseException as details:
        parser.exit(1, fill(unicode(details)) + '\n')
def main(**kwargs):
    """CLI entry point: import a CSV/TSV file (or stdin) into a new GTable.

    Column names/types come from --columns when given, otherwise from the
    first row of the file.  Exits via parser.exit(1, ...) on any error.
    """
    # With no kwargs this behaves as a command-line tool; kwargs allow
    # programmatic invocation with pre-built arguments.
    if len(kwargs) == 0:
        args = parser.parse_args(sys.argv[1:])
    else:
        args = parser.parse_args(kwargs)

    try:
        process_dataobject_args(args)
    except Exception as details:
        parser.exit(1, unicode(details) + '\n')
    try:
        process_single_dataobject_output_args(args)
    except Exception as details:
        parser.exit(1, unicode(details) + '\n')

    # Resolve destination project/folder/name from --output, defaulting to
    # the current workspace and working directory.
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = dxpy.config.get('DX_CLI_WD', u'/')
        if args.filename != '-':
            name = os.path.basename(args.filename)
        else:
            name = None
    else:
        project, folder, name = resolve_path(args.output)
        if name is None and args.filename != '-':
            name = os.path.basename(args.filename)

    args.indices = [] if args.indices is None else json.loads(args.indices)
    if args.gri is not None:
        # --gri implies a genomic range index plus the 'gri' type.
        args.indices.append(
            dxpy.DXGTable.genomic_range_index(args.gri[0], args.gri[1],
                                              args.gri[2]))
        args.types = ['gri'] if args.types is None else args.types + ['gri']

    if args.filename == '-':
        fd = sys.stdin
    else:
        # FIX: narrowed the bare `except:` to the exception open() actually
        # raises; a bare except also swallowed KeyboardInterrupt/SystemExit.
        try:
            fd = open(args.filename, 'rb')
        except IOError:
            parser.exit(
                1,
                fill(
                    unicode('Could not open ' + args.filename +
                            ' for reading')) + '\n')

    # The first row is read separately: it is either a header (column
    # specs) or, with --columns, the first data row.
    firstrow = fd.readline()

    if args.csv:
        delimiter = ','
        dialect = 'excel'
    else:
        delimiter = '\t'
        dialect = 'excel'
    # else:
    #     # Try to sniff the file format
    #     dialect = csv.Sniffer().sniff(firstrow)
    #     delimiter = dialect.delimiter

    firstrow_reader = csv.reader([firstrow], dialect=dialect,
                                 delimiter=delimiter)
    # FIX: use the next() builtin instead of the Python-2-only .next()
    # iterator method, so the block works on both Python 2.6+ and 3.
    firstrow_data = next(firstrow_reader)
    reader = csv.reader(fd, dialect=dialect, delimiter=delimiter)

    column_specs = []
    types = []
    if args.columns is not None:
        specs = split_unescaped(',', args.columns)
    else:
        specs = firstrow_data
    # Each spec is "name" (string column) or "name:type".
    for spec in specs:
        if ':' in spec:
            col_type = spec[spec.find(':') + 1:]
            column_specs.append({
                'name': spec[:spec.find(':')],
                'type': col_type
            })
            if 'int' in col_type:
                types.append('int')
            elif col_type == 'boolean':
                types.append('boolean')
            elif col_type in ['float', 'double']:
                types.append('float')
            elif col_type == 'string':
                types.append('string')
            else:
                parser.exit(1,
                            'Unrecognized column type: ' + col_type + '\n')
        else:
            column_specs.append({'name': spec, 'type': 'string'})
            types.append('string')
    try:
        dxgtable = dxpy.new_dxgtable(project=project, name=name,
                                     tags=args.tags, types=args.types,
                                     hidden=args.hidden,
                                     properties=args.properties,
                                     details=args.details, folder=folder,
                                     parents=args.parents,
                                     columns=column_specs,
                                     indices=args.indices)
        # With --columns the first line was data, not a header: import it.
        if args.columns is not None:
            dxgtable.add_row([
                parse_item(firstrow_data[i], types[i])
                for i in range(len(types))
            ])
        for row in reader:
            dxgtable.add_row(
                [parse_item(row[i], types[i]) for i in range(len(types))])
        dxgtable.close(block=args.wait)
        if args.brief:
            print(dxgtable.get_id())
        else:
            print_desc(
                dxgtable.describe(incl_properties=True, incl_details=True))
    except Exception as details:
        parser.exit(1, fill(unicode(details)) + '\n')
def main(**job_inputs):
    """Entry point for the BWA mapping app (Python 2).

    Validates the input reads tables, locates or builds an indexed
    reference, creates the output Mappings GTable, then launches one
    'map' subjob per chunk of rows and a final 'postprocess' job.

    Returns job outputs containing 'indexed_reference' and a job-based
    reference for 'mappings'.
    """
    job_outputs = {}
    reads_inputs = job_inputs['reads']
    reads_ids = [r['$dnanexus_link'] for r in reads_inputs]
    reads_descriptions = {r: dxpy.DXGTable(r).describe() for r in reads_ids}
    reads_columns = {r: [col['name'] for col in desc['columns']] for r, desc in reads_descriptions.items()}
    # Debug dumps of the resolved inputs.
    print reads_inputs
    print reads_ids
    print reads_descriptions
    print reads_columns

    all_reads_have_FlowReads_tag = all(['FlowReads' in desc['types'] for desc in reads_descriptions.values()])
    all_reads_have_LetterReads_tag = all(['LetterReads' in desc['types'] for desc in reads_descriptions.values()])
    # Presence of these columns in ANY table determines the output schema.
    reads_have_names = any(['name' in columns for columns in reads_columns.values()])
    reads_are_paired = any(['sequence2' in columns for columns in reads_columns.values()])
    reads_have_qualities = any(['quality' in columns for columns in reads_columns.values()])
    if reads_have_qualities:
        # If any table has qualities, all of them must.
        assert(all(['quality' in columns for columns in reads_columns.values()]))

    if reads_are_paired:
        all_paired = all(['sequence2' in columns for columns in reads_columns.values()])
        if not all_paired:
            raise dxpy.AppError("Reads to be mapped must be either all paired or all unpaired. App input contains both paired and unpaired reads.")

    if job_inputs["algorithm"] == "bwasw":
        assert(not reads_are_paired) # bwasw does not support paired inputs

    assert(all_reads_have_FlowReads_tag or all_reads_have_LetterReads_tag)

    # Decide whether the provided reference is already a BWA index or a
    # plain ContigSet that still needs indexing.
    reference_record_types = dxpy.describe(job_inputs['reference'])['types']
    if "BwaLetterContigSetV3" in reference_record_types:
        input_ref_is_indexed = True
    elif "ContigSet" in reference_record_types:
        input_ref_is_indexed = False
    else:
        raise dxpy.ProgramError("Unrecognized object passed as reference. It must be a ContigSet record or a BwaLetterContigSetV3 file")

    if input_ref_is_indexed:
        job_outputs['indexed_reference'] = job_inputs['reference']
    else:
        # Reuse a cached index linked to this reference if one exists;
        # otherwise build one.
        found_cached_idx = False
        for result in dxpy.find_data_objects(classname='record',
                                             typename='BwaLetterContigSetV3',
                                             link=job_inputs['reference']['$dnanexus_link']):
            job_outputs['indexed_reference'] = dxpy.dxlink(result['id'])
            found_cached_idx = True
            break
        if not found_cached_idx:
            job_outputs['indexed_reference'] = dxpy.dxlink(make_indexed_reference(job_inputs))

    # Output schema: optional name/quality columns first, then the fixed
    # mapping columns, then pair/flow/SAM-field columns as applicable.
    table_columns = [("sequence", "string")]
    if reads_have_names:
        table_columns.append(("name", "string"))
    if reads_have_qualities:
        table_columns.append(("quality", "string"))
    table_columns.extend([("status", "string"),
                          ("chr", "string"),
                          ("lo", "int32"),
                          ("hi", "int32"),
                          ("negative_strand", "boolean"),
                          ("error_probability", "uint8"),
                          ("qc_fail", "boolean"),
                          ("duplicate", "boolean"),
                          ("cigar", "string"),
                          ("template_id", "int64"),
                          ("read_group", "int32")])
    # optional sam fields: RG BC XC XT NM CM XN SM AM XM X0 X1 XG MD XA
    if reads_are_paired:
        table_columns.extend([("mate_id", "int32"), # TODO: int8
                              ("status2", "string"),
                              ("chr2", "string"),
                              ("lo2", "int32"),
                              ("hi2", "int32"),
                              ("negative_strand2", "boolean"),
                              ("proper_pair", "boolean")])
    if all_reads_have_FlowReads_tag:
        table_columns.extend([("flowgram", "string"),
                              ("flow_indices", "string"),
                              ("clip_qual_left", "int32"),
                              ("clip_qual_right", "int32"),
                              ("clip_adapter_left", "int32"),
                              ("clip_adapter_right", "int32")])
    table_columns.extend([("sam_field_BC", "string"),
                          ("sam_field_XC", "int32"),
                          ("sam_field_XT", "string"),
                          ("sam_field_NM", "int32"),
                          ("sam_field_CM", "int32"),
                          ("sam_field_XN", "int32"),
                          ("sam_field_SM", "int32"),
                          ("sam_field_AM", "int32"),
                          ("sam_field_XM", "int32"),
                          ("sam_field_X0", "int32"),
                          ("sam_field_X1", "int32"),
                          ("sam_field_XG", "int32"),
                          ("sam_field_MD", "string"),
                          ("sam_field_XA", "string"),
                          ("sam_optional_fields", "string")])

    column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in table_columns]
    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
    t = dxpy.new_dxgtable(column_descriptors, indices=[gri_index])
    if input_ref_is_indexed:
        original_contigset = dxpy.get_details(job_inputs['reference'])['original_contigset']
    else:
        original_contigset = job_inputs['reference']
    t.set_details({'original_contigset': original_contigset})
    t.add_types(["LetterMappings", "Mappings", "gri"])

    # name table
    if 'output_name' in job_inputs:
        t.rename(job_inputs['output_name'])
    else:
        first_reads_name = dxpy.DXGTable( job_inputs['reads'][0] ).describe()['name']
        contig_set_name = dxpy.describe(job_inputs['reference'])['name']
        # if we're working on an indexed_reference we're not guaranteed to have access to original_contigset
        if input_ref_is_indexed:
            contig_set_name = contig_set_name.split(' (index')[0]
        t.rename(first_reads_name + " mapped to " + contig_set_name)

    # declare how many paired or single reads are in each reads table
    read_group_lengths = []
    for i in range(len(reads_ids)):
        current_length = reads_descriptions[reads_ids[i]]["length"]
        if 'sequence2' in dxpy.DXGTable(reads_ids[i]).get_col_names():
            num_pairs = current_length
            num_singles = 0
        else:
            num_pairs = 0
            num_singles = current_length
        read_group_lengths.append( {"num_singles":num_singles, "num_pairs":num_pairs} )
    details = t.get_details()
    details['read_groups'] = read_group_lengths
    t.set_details(details)

    # Cumulative row offsets so each map job knows where its table starts.
    row_offsets = []; row_cursor = 0
    for i in range(len(reads_ids)):
        row_offsets.append(row_cursor)
        row_cursor += reads_descriptions[reads_ids[i]]["length"]

    chunk_size = job_inputs["chunk_size"]

    map_job_inputs = job_inputs.copy()
    map_job_inputs["row_offsets"] = row_offsets
    map_job_inputs["num_rows"] = chunk_size
    map_job_inputs["table_id"] = t.get_id()
    map_job_inputs["indexed_reference"] = job_outputs['indexed_reference']

    postprocess_job_inputs = job_inputs.copy()
    postprocess_job_inputs["table_id"] = t.get_id()

    # One map job per chunk_size rows; the postprocess job waits on all of
    # them implicitly via its chunk* job-based references.
    for start_row in xrange(0, row_cursor, chunk_size):
        map_job_inputs["start_row"] = start_row
        map_job = dxpy.new_dxjob(map_job_inputs, "map")
        print "Launched map job with", map_job_inputs
        postprocess_job_inputs["chunk%dresult" % start_row] = {'job': map_job.get_id(), 'field': 'ok'}
        postprocess_job_inputs["chunk%ddebug" % start_row] = {'job': map_job.get_id(), 'field': 'debug'}

    postprocess_job = dxpy.new_dxjob(postprocess_job_inputs, "postprocess")

    job_outputs['mappings'] = {'job': postprocess_job.get_id(), 'field': 'mappings'}

    print "MAIN OUTPUT:", job_outputs
    return job_outputs
def test_gri(self):
    """Exercise the genomic range index end-to-end: index construction,
    adding rows in non-consecutive parts, offset/limit and genomic-range
    queries, and both row iterators."""
    rows = [['chr2', 22, 28, 'j'],
            ['chr1', 0, 3, 'a'],
            ['chr1', 5, 8, 'b'],
            ['chr1', 25, 30, 'i'],
            ['chr1', 6, 10, 'c'],
            ['chr1', 19, 20, 'h'],
            ['chr1', 8, 9, 'd'],
            ['chr1', 17, 19, 'g'],
            ['chr1', 15, 23, 'e'],
            ['chr1', 16, 21, 'f']]
    columns = [{"name": 'foo', "type": 'string'},
               {"name": 'bar', "type": 'int32'},
               {"name": 'baz', "type": 'int32'},
               {"name": 'quux', "type": 'string'}]

    genomic_index = dxpy.DXGTable.genomic_range_index('foo', 'bar', 'baz')
    self.assertEqual(genomic_index,
                     {"name": "gri", "type": "genomic",
                      "chr": "foo", "lo": "bar", "hi": "baz"})
    dxgtable = dxpy.new_dxgtable(columns, indices=[genomic_index])
    self.assertEqual(dxgtable.describe()["indices"], [genomic_index])

    # Upload the ten rows in four parts with non-consecutive part IDs.
    for chunk, part in ((rows[:3], 1), (rows[3:6], 10),
                        (rows[6:9], 100), (rows[9:], 1000)):
        dxgtable.add_rows(chunk, part)
    dxgtable.close(True)
    self.assertEqual(dxgtable.describe()["length"], 10)

    # Offset + limit queries: closed rows come back sorted by (chr, lo).
    result = dxgtable.get_rows(starting=0, limit=1)
    self.assertEqual(result["data"], [[0, 'chr1', 0, 3, 'a']])
    self.assertEqual(result["next"], 1)
    self.assertEqual(result["length"], 1)

    result = dxgtable.get_rows(starting=4, limit=3)
    self.assertEqual(result["data"],
                     [[4, 'chr1', 15, 23, 'e'],
                      [5, 'chr1', 16, 21, 'f'],
                      [6, 'chr1', 17, 19, 'g']])
    self.assertEqual(result["next"], 7)
    self.assertEqual(result["length"], 3)

    # Range query with a single overlapping row
    result = dxgtable.get_rows(
        query=dxpy.DXGTable.genomic_range_query('chr1', 22, 25))
    self.assertEqual(result["data"], [[4, 'chr1', 15, 23, 'e']])
    self.assertEqual(result["next"], None)
    self.assertEqual(result["length"], 1)

    # Range query with nonconsecutive rows in result
    result = dxgtable.get_rows(
        query=dxpy.DXGTable.genomic_range_query('chr1', 20, 26))
    self.assertEqual(result["data"],
                     [[4, 'chr1', 15, 23, 'e'],
                      [5, 'chr1', 16, 21, 'f'],
                      [8, 'chr1', 25, 30, 'i']])
    self.assertEqual(result["next"], None)
    self.assertEqual(result["length"], 3)

    # iterate_rows yields rows [5, 8) with their row IDs in column 0.
    self.assertEqual([row[0] for row in dxgtable.iterate_rows(5, 8)],
                     [5, 6, 7])

    # iterate_query_rows yields the same rows as the range query above.
    query = dxpy.DXGTable.genomic_range_query('chr1', 20, 26)
    self.assertEqual([row[0] for row in dxgtable.iterate_query_rows(query)],
                     [4, 5, 8])
def constructTable(inputFileName):
    """Scan a GTF/GFF-style file and create a Spans GTable for it.

    The schema starts from the fixed reserved columns and gains one extra
    string column per attribute key found in column 9 (``key=value;``
    pairs), skipping reserved names and keys of 100+ characters.

    Returns (spansTable, additionalColumns) where additionalColumns lists
    the attribute names that were appended to the schema.
    Raises dxpy.AppError for rows without 8 or 9 fields.
    """
    attributes = {}
    # FIX: context manager — the original opened the file and never closed
    # it, leaking the handle.
    with open(inputFileName, 'r') as inputFile:
        for line in inputFile:
            if line[0] != "#":
                # Strip trailing comments, then split into fields.
                line = line.strip().split("#")[0]
                tabSplit = line.split("\t")
                if len(tabSplit) == 1:
                    # No tabs: fall back to space-separated fields, folding
                    # everything from field 9 onward into the attributes
                    # column.
                    tabSplit = line.split(" ")
                    if len(tabSplit) < 9:
                        # FIX: report the actual field count instead of the
                        # hard-coded "1" of the original message.
                        raise dxpy.AppError(
                            "One row did not have 8 or 9 entries, it had " +
                            str(len(tabSplit)) +
                            " instead. Offending line: " + line)
                    tabSplit[8] = " ".join(tabSplit[8:])
                    tabSplit = tabSplit[:9]
                if len(tabSplit) != 8 and len(tabSplit) != 9:
                    raise dxpy.AppError(
                        "One row did not have 8 or 9 entries, it had " +
                        str(len(tabSplit)) +
                        " instead. Offending line: " + line)
                elif len(tabSplit) == 9:
                    # Collect every attribute key seen anywhere in the file.
                    reg = re.findall("([^=]*)=([^;]*);", tabSplit[8].strip() + ";")
                    for x in reg:
                        attributes[x[0]] = True

    # Attribute keys that clash with fixed schema columns are ignored.
    reservedColumns = ["", "chr", "lo", "hi", "name", "span_id", "type",
                       "score", "is_coding", "parent_id", "frame",
                       "description", "source"]

    #Construct table
    schema = [
            {"name": "chr", "type": "string"},
            {"name": "lo", "type": "uint32"},
            {"name": "hi", "type": "uint32"},
            {"name": "name", "type": "string"},
            {"name": "span_id", "type": "int32"},
            {"name": "type", "type": "string"},
            {"name": "strand", "type": "string"},
            {"name": "score", "type": "float"},
            {"name": "is_coding", "type": "boolean"},
            {"name": "parent_id", "type": "int32"},
            {"name": "frame", "type": "int16"},
            {"name": "description", "type": "string"},
            {"name": "source", "type": "string"}]

    additionalColumns = []
    # FIX: .items() instead of the Python-2-only .iteritems(); behaves the
    # same on Python 2 and also works on Python 3.
    for k, v in attributes.items():
        if k not in reservedColumns and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                   dxpy.DXGTable.lexicographic_index_column("name", True, False),
                   dxpy.DXGTable.lexicographic_index_column("chr"),
                   dxpy.DXGTable.lexicographic_index_column("lo"),
                   dxpy.DXGTable.lexicographic_index_column("hi"),
                   dxpy.DXGTable.lexicographic_index_column("type")], "search")]
    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
def test_var_initialization(self):
    """Smoke-test variable initialization for every supported input class.

    Builds an applet whose input spec covers every class (required and
    optional), runs it locally for each supported language, and — when job
    running is enabled — builds and runs it remotely. The test passes as
    long as nothing raises and the app reports success.
    """
    print("Setting current project to", self.project)
    dxpy.WORKSPACE_ID = self.project
    dxpy.PROJECT_CONTEXT_ID = self.project

    # Make some data objects for input
    dxapplet = dxpy.api.applet_new({
        "project": dxpy.WORKSPACE_ID,
        "name": "anapplet",
        "dxapi": "1.0.0",
        "runSpec": {"code": "", "interpreter": "bash"},
    })['id']
    dxfile = dxpy.upload_string("foo", name="afile")
    dxgtable = dxpy.new_dxgtable(columns=[{"name": "int_col", "type": "int"}],
                                 name="agtable")
    dxgtable.add_rows([[3], [0]])
    dxgtable.close(block=True)
    dxrecord = dxpy.new_dxrecord(name="arecord")
    dxrecord.close()

    dxapp_json = {"name": "all_vars",
                  "title": "all_vars",
                  "summary": "all_vars",
                  "dxapi": "1.0.0",
                  "version": "0.0.1",
                  "categories": [],
                  "inputSpec": [],
                  "outputSpec": []}

    classes = ['applet', 'record', 'file', 'gtable',
               'boolean', 'int', 'float', 'string', 'hash',
               'array:applet', 'array:record', 'array:file', 'array:gtable',
               'array:boolean', 'array:int', 'array:float', 'array:string']
    for cls in classes:
        flat_name = cls.replace(":", "_")
        dxapp_json['inputSpec'].append({"name": "required_" + flat_name,
                                        "class": cls,
                                        "optional": False})
        # Note: marking outputs as optional so that empty arrays
        # will be acceptable; keeping names the same (as required)
        # in order to allow pass-through from input variables
        dxapp_json['outputSpec'].append({"name": "required_" + flat_name,
                                         "class": cls,
                                         "optional": True})
        dxapp_json['inputSpec'].append({"name": "optional_" + flat_name,
                                        "class": cls,
                                        "optional": True})

    cmdline_args = ['-irequired_applet=anapplet',
                    '-irequired_array_applet=anapplet',
                    '-irequired_record=arecord',
                    '-irequired_array_record=arecord',
                    '-irequired_file=afile',
                    '-irequired_array_file=afile',
                    '-irequired_gtable=agtable',
                    '-irequired_array_gtable=agtable',
                    '-irequired_boolean=true',
                    '-irequired_array_boolean=true',
                    '-irequired_int=32',
                    '-irequired_array_int=42',
                    '-irequired_float=3.4',
                    '-irequired_array_float=.42',
                    '-irequired_string=foo',
                    '-irequired_array_string=bar',
                    '-irequired_hash={"foo":"bar"}']

    for lang in supported_languages:
        appdir = create_app_dir_with_dxapp_json(dxapp_json, lang)
        # Test with bare-minimum of inputs
        output = subprocess.check_output(['dx-run-app-locally', appdir] + cmdline_args)
        print(output)
        self.assertIn("App finished successfully", output)
        # See PTFM-13697 for CentOS 5 details
        if testutil.TEST_RUN_JOBS and not testutil.host_is_centos_5():
            # Now actually make it an applet and run it
            applet_name = dxapp_json['name'] + '-' + lang
            subprocess.check_output(['dx', 'build', appdir,
                                     '--destination', applet_name])
            subprocess.check_output(['dx', 'run', applet_name, '-y', '--wait'] + cmdline_args)
def import_genes(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags, delimiter="\t"):
    """Import a gene-model (12+ column) BED file into a new Genes GTable.

    Each data line yields a parent "transcript" span plus one child span per
    block; children are classified as exon, CDS, or 5'/3' UTR using the
    thickStart/thickEnd columns and strand.

    Returns a dxlink to the created gtable.

    Raises dxpy.AppError on rows with fewer than 12 columns or mismatched
    property key/value lists.
    """
    # implement BED importing from this format:
    # http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    columns = [("chr", "string"),
               ("lo", "int32"),
               ("hi", "int32"),
               ("name", "string"),
               ("span_id", "int32"),
               ("type", "string"),
               ("strand", "string"),
               ("is_coding", "boolean"),
               ("parent_id", "int32"),
               ("frame", "int16"),
               ("description", "string")]

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]

    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                  dxpy.DXGTable.lexicographic_index_column("name", True, False),
                  dxpy.DXGTable.lexicographic_index_column("chr"),
                  dxpy.DXGTable.lexicographic_index_column("lo"),
                  dxpy.DXGTable.lexicographic_index_column("hi"),
                  dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    # Template row: span_id/parent_id/frame default to -1, strand to "."
    default_row = ["", 0, 0, "", -1, "", ".", False, -1, -1, ""]

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        # NOTE(review): span_table_id is never used below — confirm before removing
        span_table_id = span.get_id()
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]
        span.set_details(details)
        span.add_types(["gri", "Genes"])
        span.rename(table_name)

        current_span_id = 0

        # where the parsing magic happens
        for line in bed:
            if line.startswith("track"):
                # Track lines are stashed in the table details, not stored as rows
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            row = list(default_row)
            line = line.split(delimiter)
            # NOTE(review): validate_line is defined elsewhere in this file;
            # presumably raises on malformed fields — confirm
            validate_line(line)
            if len(line) < 12:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in gene model-like BED file contains less than 12 columns. Invalid BED file.")

            # add parent gene track
            row = generate_gene_row(line, 0, 0, "transcript", default_row, -1, current_span_id)
            if row != None:
                span.add_row(row)
                current_parent_id = current_span_id
                current_span_id += 1

                # add all children
                blockCount = int(line[9])
                line[10] = line[10].rstrip(",").split(",")
                blockSizes = [int(line[10][n]) for n in range(blockCount)]
                line[11] = line[11].rstrip(",").split(",")
                blockStarts = [int(line[11][n]) for n in range(blockCount)]

                gene_lo = int(line[1])
                gene_hi = int(line[2])

                # set thick* to be within the gene if outside
                thickStart = min(max(int(line[6]), gene_lo), gene_hi)
                thickEnd = max(min(int(line[7]), gene_hi), gene_lo)

                for i in range(blockCount):
                    # look to thickStart and thickEnd to get information about the type of this region
                    # if thick* are the same or cover the whole transcript then we ignore them
                    # else, we partition the exons into CDS and UTR based on their boundaries
                    if thickStart == thickEnd or (thickStart == gene_lo and thickEnd == gene_hi):
                        span.add_row(generate_gene_row(line, blockSizes[i], blockStarts[i], "exon", default_row, current_parent_id, current_span_id))
                        current_span_id += 1
                    else:
                        exon_lo = int(line[1])+blockStarts[i]
                        exon_hi = int(exon_lo+blockSizes[i])

                        # we're all UTR if we enter either of these
                        if (exon_hi <= thickStart and line[5] == '+') or (exon_lo >= thickEnd and line[5] == '-'):
                            span.add_row(generate_gene_row(line, blockSizes[i], blockStarts[i], "5' UTR", default_row, current_parent_id, current_span_id))
                            current_span_id += 1
                        elif (exon_hi <= thickStart and line[5] == '-') or (exon_lo >= thickEnd and line[5] == '+'):
                            span.add_row(generate_gene_row(line, blockSizes[i], blockStarts[i], "3' UTR", default_row, current_parent_id, current_span_id))
                            current_span_id += 1
                        # if this is true then we overlap CDS partially or completely
                        elif (exon_lo < thickEnd and exon_hi > thickStart):
                            # entirely contained
                            if exon_lo >= thickStart and exon_hi <= thickEnd:
                                span.add_row(generate_gene_row(line, blockSizes[i], blockStarts[i], "CDS", default_row, current_parent_id, current_span_id))
                                current_span_id += 1
                            else:
                                # left portion is UTR
                                if exon_lo < thickStart:
                                    if line[5] == '+':
                                        UTR_type = "5' UTR"
                                    else:
                                        UTR_type = "3' UTR"
                                    # portion of the block that precedes thickStart
                                    UTR_size = (min(blockSizes[i], thickStart - exon_lo))
                                    span.add_row(generate_gene_row(line, UTR_size, blockStarts[i], UTR_type, default_row, current_parent_id, current_span_id))
                                    current_span_id += 1

                                # CDS portion
                                # clip the block to [thickStart, thickEnd] on both sides
                                CDS_size = blockSizes[i] - (max(exon_lo, thickStart) - exon_lo)
                                CDS_size -= (exon_hi - min(exon_hi, thickEnd))
                                CDS_start = (max(exon_lo, thickStart) - exon_lo) + blockStarts[i]
                                span.add_row(generate_gene_row(line, CDS_size, CDS_start, "CDS", default_row, current_parent_id, current_span_id))
                                current_span_id += 1

                                # right portion is UTR
                                if exon_hi > thickEnd:
                                    if line[5] == '+':
                                        UTR_type = "3' UTR"
                                    else:
                                        UTR_type = "5' UTR"
                                    UTR_size = (min(blockSizes[i], exon_hi - thickEnd))
                                    UTR_start = blockStarts[i] + thickEnd - exon_lo
                                    span.add_row(generate_gene_row(line, UTR_size, UTR_start, UTR_type, default_row, current_parent_id, current_span_id))
                                    current_span_id += 1

    return dxpy.dxlink(span.get_id())
def import_named_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags):
    """Import a plain (non gene-model) BED file into a new Spans GTable.

    Columns beyond the standard nine are kept as generic string columns named
    "BED_column_N". Rows whose numeric fields fail to parse are skipped;
    rows with fewer columns than the schema keep the defaults for the
    missing trailing fields.

    Returns a dxlink to the created gtable.

    Raises dxpy.AppError on rows with fewer than 3 columns or mismatched
    property key/value lists.
    """
    # NOTE(review): find_num_columns is defined elsewhere; presumably returns
    # the widest column count seen in the file — confirm
    num_cols = find_num_columns(bed_file)
    possible_columns = [("chr", "string"),
                        ("lo", "int32"),
                        ("hi", "int32"),
                        ("name", "string"),
                        ("score", "float"),
                        ("strand", "string"),
                        ("thick_start", "int32"),
                        ("thick_end", "int32"),
                        ("item_rgb", "string")]
    possible_default_row = ["", 0, 0, "", 0, ".", 0, 0, ""]
    columns = possible_columns[:num_cols]

    # Files wider than the standard 9 columns get generic string columns
    if num_cols > len(columns):
        for i in range(len(columns), num_cols):
            columns.append(("BED_column_"+str(i+1), "string"))
            possible_default_row.append("")

    default_row = possible_default_row[:num_cols]
    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]
    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
    name_index = dxpy.DXGTable.lexicographic_index([["name", "ASC"]], "name")

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=[gri_index, name_index], mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]
        span.set_details(details)
        span.add_types(["Spans", "gri"])
        span.rename(table_name)

        for line in bed:
            row = list(default_row)
            if line.startswith("track"):
                # Track lines are recorded in the table details, not as rows
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            line = line.split()
            # check to see if this is a weird line
            # NOTE(review): a blank line stops the whole import (break, not
            # continue) — confirm that a mid-file blank line should truncate
            if len(line) == 0:
                break
            if len(line) < 3:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in BED file contains less than the minimum 3 columns. Invalid BED file.")

            try:
                row[0] = line[0]
                row[1] = int(line[1])
                row[2] = int(line[2])
                row[3] = line[3]
                # dashes are sometimes used when field is invalid
                if line[4] == "-":
                    line[4] = 0
                row[4] = int(line[4])
                row[5] = line[5]
                # dashes are sometimes used when field is invalid
                if line[6] == "-":
                    line[6] = 0
                row[6] = int(line[6])
                # dashes are sometimes used when field is invalid
                if line[7] == "-":
                    line[7] = 0
                row[7] = int(line[7])
                row[8] = line[8]
            # an index error would come from having fewer columns in a row, which we should handle ok
            except IndexError:
                pass
            # value error when fields are messed up and string gets converted to int, etc. Throw these out.
            except ValueError:
                continue

            span.add_row(row)

    return dxpy.dxlink(span.get_id())
def test_table_context_manager_destructor(self):
    """Create a gtable without a context manager and add rows, leaving the
    flush to happen when the handler is garbage-collected."""
    col_a = dxpy.DXGTable.make_column_desc("a", "string")
    col_b = dxpy.DXGTable.make_column_desc("b", "int32")
    table = dxpy.new_dxgtable([col_a, col_b])
    for row_num in range(64):
        table.add_rows(data=[["row" + str(row_num), row_num]])
def test_create_table_with_invalid_spec(self):
    """Creating a gtable with an unknown column type must raise DXAPIError."""
    # "muffins" is not a valid column type, so the server rejects the spec
    bad_columns = [dxpy.DXGTable.make_column_desc("a", "string"),
                   dxpy.DXGTable.make_column_desc("b", "muffins")]
    with self.assertRaises(DXAPIError):
        dxpy.new_dxgtable(bad_columns)
def constructTable(inputFileName):
    """Build the schema for a Spans GTable from a GTF annotation file.

    Scans every non-comment line, verifying that each row's attribute column
    (field 9) carries both gene_id and transcript_id, and collecting every
    attribute key seen. Creates a GTable with the fixed span columns plus
    gene_id/transcript_id plus one string column per extra attribute key.

    Returns:
        (spansTable, additionalColumns): the new dxpy GTable handler and the
        list of attribute-derived column names (always beginning with
        'gene_id' and 'transcript_id').

    Raises:
        dxpy.AppError: if a row does not have 9 fields, or lacks gene_id or
        transcript_id.
    """
    attributes = {"gene_id": True, "transcript_id": True}
    # Context manager closes the input file (the original leaked the handle)
    with open(inputFileName, 'r') as inputFile:
        for line in inputFile:
            if line[0] != "#":
                tabSplit = line.split("\t")
                if len(tabSplit) == 1:
                    # No tabs found: fall back to space-separated fields
                    tabSplit = line.split(" ")
                    if len(tabSplit) < 9:
                        # Report the actual field count (was hard-coded "1")
                        raise dxpy.AppError("One row did not have 9 entries, it had " + str(len(tabSplit)) + " instead. Offending line: " + line)
                    # Re-join fields 9+: the attributes column may contain spaces
                    tabSplit[8] = " ".join(tabSplit[8:])
                    tabSplit = tabSplit[:9]
                if len(tabSplit) != 9:
                    raise dxpy.AppError("One row did not have 9 entries, it had " + str(len(tabSplit)) + " instead. Offending line: " + line)
                else:
                    # Column 9 holds ';'-separated "key value" pairs
                    entrySplit = tabSplit[8].split(";")
                    geneIdPresent = False
                    transcriptIdPresent = False
                    for x in entrySplit:
                        keyValue = x.strip().split(" ")
                        key = keyValue[0]
                        if key == "gene_id":
                            geneIdPresent = True
                        elif key == "transcript_id":
                            transcriptIdPresent = True
                        attributes[key] = True
                if not geneIdPresent:
                    raise dxpy.AppError("One row did not have a gene_id Offending line: " + line)
                if not transcriptIdPresent:
                    # BUG FIX: this branch previously raised the gene_id message
                    raise dxpy.AppError("One row did not have a transcript_id Offending line: " + line)

    #Construct table
    schema = [{"name": "chr", "type": "string"},
              {"name": "lo", "type": "uint32"},
              {"name": "hi", "type": "uint32"},
              {"name": "name", "type": "string"},
              {"name": "span_id", "type": "int32"},
              {"name": "type", "type": "string"},
              {"name": "strand", "type": "string"},
              {"name": "score", "type": "float"},
              {"name": "is_coding", "type": "boolean"},
              {"name": "parent_id", "type": "int32"},
              {"name": "frame", "type": "int16"},
              {"name": "description", "type": "string"},
              {"name": "source", "type": "string"},
              {"name": "gene_id", "type": "string"},
              {"name": "transcript_id", "type": "string"}]

    additionalColumns = ['gene_id', 'transcript_id']
    # Iterate keys directly (values are always True); drops the Python-2-only
    # iteritems() whose value was unused
    for k in attributes:
        # Skip keys already in the fixed schema; len(k) < 100 guards against
        # absurdly long attribute keys
        if k not in ('', 'gene_id', 'transcript_id') and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                   dxpy.DXGTable.lexicographic_index_column("name", True, False),
                   dxpy.DXGTable.lexicographic_index_column("chr"),
                   dxpy.DXGTable.lexicographic_index_column("lo"),
                   dxpy.DXGTable.lexicographic_index_column("hi"),
                   dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
def constructTable(inputFileName):
    """Build the schema for a Spans GTable from a GTF annotation file.

    Validates that each non-comment row has 9 fields and that its attribute
    column contains both gene_id and transcript_id, collecting every
    attribute key along the way. Creates a GTable with the fixed span
    columns, gene_id, transcript_id, and a string column per extra key.

    Returns:
        (spansTable, additionalColumns) — the new dxpy GTable handler and
        the attribute-derived column names.

    Raises:
        dxpy.AppError: on rows without 9 fields or missing the required
        gene_id / transcript_id attributes.
    """
    attributes = {"gene_id": True, "transcript_id": True}
    # Close the input file deterministically (the original leaked the handle)
    with open(inputFileName, 'r') as inputFile:
        for line in inputFile:
            if line[0] != "#":
                tabSplit = line.split("\t")
                if len(tabSplit) == 1:
                    # Fall back to space-separated fields when no tabs found
                    tabSplit = line.split(" ")
                    if len(tabSplit) < 9:
                        # Report the real field count (was hard-coded "1")
                        raise dxpy.AppError(
                            "One row did not have 9 entries, it had " +
                            str(len(tabSplit)) + " instead. Offending line: " + line)
                    # Fields 9+ belong to the attributes column; re-join them
                    tabSplit[8] = " ".join(tabSplit[8:])
                    tabSplit = tabSplit[:9]
                if len(tabSplit) != 9:
                    raise dxpy.AppError(
                        "One row did not have 9 entries, it had " +
                        str(len(tabSplit)) + " instead. Offending line: " + line)
                else:
                    # Attribute column holds ';'-separated "key value" pairs
                    entrySplit = tabSplit[8].split(";")
                    geneIdPresent = False
                    transcriptIdPresent = False
                    for x in entrySplit:
                        keyValue = x.strip().split(" ")
                        key = keyValue[0]
                        if key == "gene_id":
                            geneIdPresent = True
                        elif key == "transcript_id":
                            transcriptIdPresent = True
                        attributes[key] = True
                if not geneIdPresent:
                    raise dxpy.AppError(
                        "One row did not have a gene_id Offending line: " + line)
                if not transcriptIdPresent:
                    # BUG FIX: previously raised the gene_id message here
                    raise dxpy.AppError(
                        "One row did not have a transcript_id Offending line: " + line)

    #Construct table
    schema = [{"name": "chr", "type": "string"},
              {"name": "lo", "type": "uint32"},
              {"name": "hi", "type": "uint32"},
              {"name": "name", "type": "string"},
              {"name": "span_id", "type": "int32"},
              {"name": "type", "type": "string"},
              {"name": "strand", "type": "string"},
              {"name": "score", "type": "float"},
              {"name": "is_coding", "type": "boolean"},
              {"name": "parent_id", "type": "int32"},
              {"name": "frame", "type": "int16"},
              {"name": "description", "type": "string"},
              {"name": "source", "type": "string"},
              {"name": "gene_id", "type": "string"},
              {"name": "transcript_id", "type": "string"}]

    additionalColumns = ['gene_id', 'transcript_id']
    # Iterate keys directly — values are always True, and this avoids the
    # Python-2-only iteritems()
    for k in attributes:
        if k not in ('', 'gene_id', 'transcript_id') and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                   dxpy.DXGTable.lexicographic_index_column("name", True, False),
                   dxpy.DXGTable.lexicographic_index_column("chr"),
                   dxpy.DXGTable.lexicographic_index_column("lo"),
                   dxpy.DXGTable.lexicographic_index_column("hi"),
                   dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
def upload_transcripts_file(trans_file, sample_name):
    """Parse cufflinks FPKM tracking output and upload it as a GTable.

    Skips the header line, then converts each "chr:lo-hi" locus (lo made
    0-based) and the numeric columns into rows of a genomic-range-indexed
    gtable named "<sample_name>_FPKM_per_gene".

    Returns the closed dxpy.DXGTable handler.

    Raises dxpy.AppError when a data line has too few columns.
    """
    with open(trans_file, 'r') as fh:
        # eat column header line
        line = fh.readline().rstrip('\n')
        line = line.split('\t')

        trans_schema = [("chr", "string"), ("lo", "int32"), ("hi", "int32"),
                        ("tracking_id", "string"), ("class_code", "string"),
                        ("nearest_ref_id", "string"), ("gene_id", "string"),
                        ("gene_short_name", "string"), ("tss_id", "string"),
                        ("length", "int32"), ("coverage", "float"),
                        ("FPKM", "float"), ("FPKM_lo", "float"),
                        ("FPKM_hi", "float"), ("status", "string")]

        column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in trans_schema]
        gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
        transcripts = dxpy.new_dxgtable(column_descriptors, indices=[gri_index])
        transcripts.rename(sample_name + "_FPKM_per_gene")

        while True:
            line = fh.readline()
            line = line.rstrip('\n')
            # '' signals EOF (a data line is never empty)
            if line == '':
                break
            line = line.split('\t')
            try:
                # locus column is "chr:lo-hi"; convert lo to 0-based
                chrom = line[6].split(":")[0]
                lo = int(line[6].split(":")[1].split("-")[0]) - 1
                hi = int(line[6].split(":")[1].split("-")[1])

                # '-' means field absent: use 0 length / -1 coverage sentinels
                if line[7] == '-':
                    line[7] = 0
                if line[8] == '-':
                    line[8] = -1

                trans_row = [chrom, lo, hi,
                             line[0], line[1], line[2], line[3], line[4], line[5],
                             int(line[7]),
                             float(line[8]), float(line[9]), float(line[10]), float(line[11]),
                             line[12]]
                transcripts.add_row(trans_row)
            except IndexError:
                # BUG FIX: `line` is a list here; concatenating it to a str
                # raised TypeError and masked the intended AppError. Join the
                # fields back (via str, since some may have been replaced by
                # numeric sentinels above) so the message renders.
                raise dxpy.AppError("Error parsing transcript file from cufflinks. Line: " + "\t".join(map(str, line)))

    transcripts.close(block=True)
    return transcripts
def test_var_initialization(self):
    '''
    This test assumes a well-formed input spec and mostly just tests that
    everything compiles and the variable initialization code does not throw
    any errors.

    It also checks that a boolean array input given twice on the command
    line ([true, false]) is echoed in the input dump, the logs, and the
    final output (hence the count of 3).
    '''
    print("Setting current project to", self.project)
    dxpy.WORKSPACE_ID = self.project
    dxpy.PROJECT_CONTEXT_ID = self.project

    # Make some data objects for input
    dxapplet = dxpy.api.applet_new({"project": dxpy.WORKSPACE_ID,
                                    "name": "anapplet",
                                    "dxapi": "1.0.0",
                                    "runSpec": {"code": "", "interpreter": "bash"}})['id']
    dxfile = dxpy.upload_string("foo", name="afile")
    dxgtable = dxpy.new_dxgtable(columns=[{"name": "int_col", "type": "int"}],
                                 name="agtable")
    dxgtable.add_rows([[3], [0]])
    dxgtable.close(block=True)
    dxrecord = dxpy.new_dxrecord(name="arecord")
    dxrecord.close()

    dxapp_json = {
        "name": "all_vars",
        "title": "all_vars",
        "summary": "all_vars",
        "dxapi": "1.0.0",
        "version": "0.0.1",
        "categories": [],
        "inputSpec": [],
        "outputSpec": []
    }
    classes = ['applet', 'record', 'file', 'gtable',
               'boolean', 'int', 'float', 'string', 'hash',
               'array:applet', 'array:record', 'array:file', 'array:gtable',
               'array:boolean', 'array:int', 'array:float', 'array:string']
    for classname in classes:
        dxapp_json['inputSpec'].append({"name": "required_" + classname.replace(":", "_"),
                                        "class": classname,
                                        "optional": False})
        # Note: marking outputs as optional so that empty arrays
        # will be acceptable; keeping names the same (as required)
        # in order to allow pass-through from input variables
        dxapp_json['outputSpec'].append({"name": "required_" + classname.replace(":", "_"),
                                         "class": classname,
                                         "optional": True})
        dxapp_json['inputSpec'].append({"name": "optional_" + classname.replace(":", "_"),
                                        "class": classname,
                                        "optional": True})

    cmdline_args = ['-irequired_applet=anapplet',
                    '-irequired_array_applet=anapplet',
                    '-irequired_record=arecord',
                    '-irequired_array_record=arecord',
                    '-irequired_file=afile',
                    '-irequired_array_file=afile',
                    '-irequired_gtable=agtable',
                    '-irequired_array_gtable=agtable',
                    '-irequired_boolean=true',
                    '-irequired_array_boolean=true',
                    '-irequired_array_boolean=false',
                    '-irequired_int=32',
                    '-irequired_array_int=42',
                    '-irequired_float=3.4',
                    '-irequired_array_float=.42',
                    '-irequired_string=foo',
                    '-irequired_array_string=bar',
                    '-irequired_hash={"foo":"bar"}']
    for lang in supported_languages:
        appdir = create_app_dir_with_dxapp_json(dxapp_json, lang)

        # Test with bare-minimum of inputs
        output = subprocess.check_output(['dx-run-app-locally', appdir] + cmdline_args)
        print(output)
        # Verify array is printed total 3 times once in each input, logs, and final output
        # (assertEqual replaces the deprecated assertEquals alias; pattern is
        # a raw string so the \[ escape is explicit)
        self.assertEqual(len(re.findall(r"required_array_boolean = \[ true, false ]", output)), 3)
        self.assertIn("App finished successfully", output)
        # See PTFM-13697 for CentOS 5 details
        if testutil.TEST_RUN_JOBS and not testutil.host_is_centos_5():
            # Now actually make it an applet and run it
            applet_name = dxapp_json['name'] + '-' + lang
            subprocess.check_output(['dx', 'build', appdir, '--destination', applet_name])
            subprocess.check_output(['dx', 'run', applet_name, '-y', '--wait'] + cmdline_args)
def upload_transcripts_file(trans_file, sample_name):
    """Parse cufflinks FPKM tracking output into a new GTable.

    Skips the header, converts each "chr:lo-hi" locus (lo made 0-based) plus
    the numeric columns into rows of a genomic-range-indexed gtable named
    "<sample_name>_FPKM_per_gene", and closes the table.

    Returns the dxpy.DXGTable handler.

    Raises dxpy.AppError when a data line has too few columns.
    """
    with open(trans_file, 'r') as fh:
        # eat column header line
        line = fh.readline().rstrip('\n')
        line = line.split('\t')

        trans_schema = [("chr", "string"), ("lo", "int32"), ("hi", "int32"),
                        ("tracking_id", "string"), ("class_code", "string"),
                        ("nearest_ref_id", "string"), ("gene_id", "string"),
                        ("gene_short_name", "string"), ("tss_id", "string"),
                        ("length", "int32"), ("coverage", "float"),
                        ("FPKM", "float"), ("FPKM_lo", "float"),
                        ("FPKM_hi", "float"), ("status", "string")]

        column_descriptors = [
            dxpy.DXGTable.make_column_desc(name, type)
            for name, type in trans_schema
        ]
        gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
        transcripts = dxpy.new_dxgtable(column_descriptors, indices=[gri_index])
        transcripts.rename(sample_name + "_FPKM_per_gene")

        while True:
            line = fh.readline()
            line = line.rstrip('\n')
            # '' signals EOF (a data line is never empty)
            if line == '':
                break
            line = line.split('\t')
            try:
                # locus column is "chr:lo-hi"; convert lo to 0-based
                chrom = line[6].split(":")[0]
                lo = int(line[6].split(":")[1].split("-")[0]) - 1
                hi = int(line[6].split(":")[1].split("-")[1])

                # '-' means field absent: use 0 length / -1 coverage sentinels
                if line[7] == '-':
                    line[7] = 0
                if line[8] == '-':
                    line[8] = -1

                trans_row = [
                    chrom, lo, hi, line[0], line[1], line[2], line[3],
                    line[4], line[5],
                    int(line[7]),
                    float(line[8]),
                    float(line[9]),
                    float(line[10]),
                    float(line[11]), line[12]
                ]
                transcripts.add_row(trans_row)
            except IndexError:
                # BUG FIX: `line` is a list here; "str" + list raised
                # TypeError and masked the intended AppError. Rebuild the
                # line text (via str, since some fields may have been
                # replaced by numeric sentinels above).
                raise dxpy.AppError(
                    "Error parsing transcript file from cufflinks. Line: " +
                    "\t".join(map(str, line)))

    transcripts.close(block=True)
    return transcripts