Exemple #1
0
    def test_table_context_manager_error_handling(self):
        """Flushing at context-manager exit must surface asynchronous add_row errors.

        In each case, the flush that happens at the close of the context
        handler should wait for the asynchronous requests and then raise the
        resulting DXAPIError.  Assumes the error is a semantic error in the
        add_row data that is NOT caught by any local error checking.
        """
        # Use new_dxgtable
        with self.assertRaises(DXAPIError):
            with dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                    dxpy.DXGTable.make_column_desc("b", "int32")], mode='w') as table1:
                table1.add_row(["", 68719476736]) # Not in int32 range

        # Use open_dxgtable and close table
        table2_id = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                       dxpy.DXGTable.make_column_desc("b", "int32")], mode='w').get_id()
        with self.assertRaises(DXAPIError):
            with dxpy.open_dxgtable(table2_id) as table2:
                table2.add_row(["", 68719476736]) # Not in int32 range
        # TODO: why does the flush in this table's destructor fail? Nothing should be getting
        # flushed then...

        # Use open_dxgtable and leave table open
        # BUG FIX: new_dxgtable returns a DXGTable handler, but open_dxgtable
        # needs the table *ID*, so call get_id() (matches the table2 case above).
        table3_id = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                       dxpy.DXGTable.make_column_desc("b", "int32")]).get_id()
        with self.assertRaises(DXAPIError):
            with dxpy.open_dxgtable(table3_id, mode='a') as table3:
                table3.add_row(["", 68719476736]) # Not in int32 range
Exemple #2
0
    def test_table_context_manager(self):
        """Exercise GTable context managers in write ('w') and append ('a') modes."""
        # Case 1: a new_dxgtable written with explicit part numbers.
        with dxpy.new_dxgtable(
            [dxpy.DXGTable.make_column_desc("a", "string"),
             dxpy.DXGTable.make_column_desc("b", "int32")], mode='w') as self.dxgtable:
            for idx in range(64):
                self.dxgtable.add_rows(data=[["row"+str(idx), idx]], part=idx+1)

        # Case 2: a new_dxgtable written without explicit parts.
        with dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                dxpy.DXGTable.make_column_desc("b", "int32")], mode='w') as second:
            second_id = second.get_id()
            for idx in range(64):
                second.add_rows(data=[["row"+str(idx), idx]])
        second = dxpy.open_dxgtable(second_id)
        self.assertEqual(second.describe()["length"], 64)
        second.remove()

        # Case 3: writing through open_dxgtable, first appending, then rewriting.
        third_id = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                      dxpy.DXGTable.make_column_desc("b", "int32")]).get_id()
        with dxpy.open_dxgtable(third_id, mode='a') as third:
            for idx in range(64):
                third.add_rows(data=[["row"+str(idx), idx]])
        with dxpy.open_dxgtable(third_id, mode='w') as third:
            for idx in range(64):
                third.add_rows(data=[["row"+str(idx), idx]])
        third = dxpy.open_dxgtable(third_id)
        # Exiting the 'w'-mode context should have started closing the table.
        state = third._get_state()
        self.assertTrue(state in ['closing', 'closed'])
        third._wait_on_close()
        self.assertEqual(third.describe()["length"], 128)
        third.remove()
Exemple #3
0
 def get_col_names(self):
     """Verify get_col_names returns __id__ followed by the declared columns."""
     self.dxgtable = dxpy.new_dxgtable(
         [dxpy.DXGTable.make_column_desc("a", "string"),
          dxpy.DXGTable.make_column_desc("b", "int32")])
     self.dxgtable.close(block=True)
     self.assertEqual(self.dxgtable.get_col_names(), ["__id__", "a", "b"])
Exemple #4
0
    def test_genes_to_gtf_conversion(self):
        """Build a minimal Genes table and check dx-genes-to-gtf output.

        The table holds one non-coding transcript+exon pair and one coding
        gene/transcript/CDS hierarchy (parent_id links children to parents;
        -1 means no parent / not applicable).
        """
        genes_table = dxpy.new_dxgtable([
            dxpy.DXGTable.make_column_desc("type", "string"),
            dxpy.DXGTable.make_column_desc("span_id", "int64"),
            dxpy.DXGTable.make_column_desc("name", "string"),
            dxpy.DXGTable.make_column_desc("strand", "string"),
            dxpy.DXGTable.make_column_desc("is_coding", "boolean"),
            dxpy.DXGTable.make_column_desc("parent_id", "int64"),
            dxpy.DXGTable.make_column_desc("frame", "int64"),
            dxpy.DXGTable.make_column_desc("description", "string"),
            dxpy.DXGTable.make_column_desc("chr", "string"),
            dxpy.DXGTable.make_column_desc("lo", "int64"),
            dxpy.DXGTable.make_column_desc("hi", "int64")
        ])
        genes_table.add_rows(data=[
            ["transcript", 5, "mytranscript-noncoding", "+", False, -1, -1, "my test transcript", "chr1", 100, 200],
            ["exon", 6, "", "+", False, 5, -1, "", "chr1", 100, 200],
            ["gene", 54, "mygene-coding", "+", True, -1, -1, "my test gene", "chr1", 150, 200],
            ["transcript", 55, "mytranscript-coding", "+", True, 54, -1, "my test transcript", "chr1", 150, 200],
            ["CDS", 75, "", "+", True, 55, 0, "", "chr1", 150, 200]
        ])
        genes_table.set_details({
            "original_contigset": {"$dnanexus_link": self.genome_id}
        })
        genes_table.close(block=True)

        # BUG FIX: assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(run('dx-genes-to-gtf {g}'.format(g=genes_table.get_id())),
                         self.expected_gtf)
Exemple #5
0
 def test_create_table(self):
     """Create a two-column table and check the schema survives describe()."""
     schema = [dxpy.DXGTable.make_column_desc("a", "string"),
               dxpy.DXGTable.make_column_desc("b", "int32")]
     self.dxgtable = dxpy.new_dxgtable(schema)
     self.dxgtable.close(block=True)
     desc = self.dxgtable.describe()
     self.assertEqual(desc["columns"], schema)
def constructTable(inputFileName):
    """Scan a GFF/GTF-style annotation file and create a spans GTable.

    Makes one pass over the file to collect the attribute keys appearing in
    column 9 (``key=value;`` pairs), then builds a table whose schema is the
    fixed spans columns plus one extra string column per non-reserved
    attribute key.

    Parameters:
        inputFileName: path to the tab- (or space-) delimited annotation file.

    Returns:
        (spansTable, additionalColumns): the new DXGTable handler and the
        list of attribute-derived column names appended to the schema.

    Raises:
        dxpy.AppError: if any data row does not have 8 or 9 fields.
    """
    attributes = {}
    # BUG FIX: open the file in a "with" block so it is always closed.
    with open(inputFileName, 'r') as inputFile:
        for line in inputFile:
            if line[0] != "#":
                line = line.strip().split("#")[0]
                tabSplit = line.split("\t")
                if len(tabSplit) == 1:
                    # Fall back to space-delimited; rejoin everything from
                    # field 9 onward since attribute text may contain spaces.
                    tabSplit = line.split(" ")
                    if len(tabSplit) < 9:
                        # BUG FIX: report the actual field count (message
                        # previously hard-coded "it had 1 instead").
                        raise dxpy.AppError("One row did not have 8 or 9 entries, it had " + str(len(tabSplit)) + " instead. Offending line: " + line)
                    tabSplit[8] = " ".join(tabSplit[8:])
                    tabSplit = tabSplit[:9]

                if len(tabSplit) != 8 and len(tabSplit) != 9:
                    raise dxpy.AppError("One row did not have 8 or 9 entries, it had " + str(len(tabSplit)) + " instead. Offending line: " + line)
                elif len(tabSplit) == 9:
                    # Appending ";" ensures the regex also captures the final pair.
                    reg = re.findall("([^=]*)=([^;]*);", tabSplit[8].strip() + ";")
                    for x in reg:
                        attributes[x[0]] = True

    # Attribute keys colliding with the fixed schema are skipped below.
    reservedColumns = ["", "chr", "lo", "hi", "name", "span_id", "type", "score", "is_coding", "parent_id", "frame", "description", "source"]

    # Construct table schema: fixed spans columns first.
    schema = [
            {"name": "chr", "type": "string"},
            {"name": "lo", "type": "uint32"},
            {"name": "hi", "type": "uint32"},
            {"name": "name", "type": "string"},
            {"name": "span_id", "type": "int32"},
            {"name": "type", "type": "string"},
            {"name": "strand", "type": "string"},
            {"name": "score", "type": "float"},
            {"name": "is_coding", "type": "boolean"},
            {"name": "parent_id", "type": "int32"},
            {"name": "frame", "type": "int16"},
            {"name": "description", "type": "string"},
            {"name": "source", "type": "string"}]

    additionalColumns = []
    for k, v in attributes.iteritems():
        # len(k) < 100 — presumably a platform column-name length limit; TODO confirm.
        if k not in reservedColumns and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                  dxpy.DXGTable.lexicographic_index_column("name", True, False),
                  dxpy.DXGTable.lexicographic_index_column("chr"),
                  dxpy.DXGTable.lexicographic_index_column("lo"),
                  dxpy.DXGTable.lexicographic_index_column("hi"),
                  dxpy.DXGTable.lexicographic_index_column("type")], "search")]
    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
Exemple #7
0
 def test_get_rows(self):
     """get_rows fails while the table is open and succeeds once it is closed."""
     self.dxgtable = dxpy.new_dxgtable(
         [dxpy.DXGTable.make_column_desc("a", "string"),
          dxpy.DXGTable.make_column_desc("b", "int32")])
     for i in range(64):
         self.dxgtable.add_rows(data=[["row"+str(i), i]], part=i+1)
     # Rows cannot be fetched from a table that is still open.
     with self.assertRaises(DXAPIError):
         rows = self.dxgtable.get_rows()
     self.dxgtable.close(block=True)
     rows = self.dxgtable.get_rows()['data']
     # BUG FIX: use assertEqual instead of a bare assert, which is silently
     # stripped when Python runs with -O.
     self.assertEqual(len(rows), 64)
Exemple #8
0
    def test_add_rows(self):
        """add_rows accepts empty batches, rejects malformed rows; double close fails."""
        self.dxgtable = dxpy.new_dxgtable(
            [dxpy.DXGTable.make_column_desc("a", "string"),
             dxpy.DXGTable.make_column_desc("b", "int32")])
        # An empty batch is a no-op but must be accepted.
        self.dxgtable.add_rows(data=[], part=9999)
        # A row with the wrong number of columns is rejected locally.
        with self.assertRaises(ValueError):
            self.dxgtable.add_rows(data=[[]], part=9997)

        for part_no in range(1, 65):
            self.dxgtable.add_rows(data=[["row"+str(part_no-1), part_no-1]], part=part_no)
        self.dxgtable.close(block=True)

        # Closing an already-closed table raises on the server side.
        with self.assertRaises(DXAPIError):
            self.dxgtable.close(block=True)
Exemple #9
0
 def test_lexicographic(self):
     """A lexicographic index definition survives the describe() round trip."""
     index_cols = [
             dxpy.DXGTable.lexicographic_index_column("a", case_sensitive=False),
             dxpy.DXGTable.lexicographic_index_column("b", ascending=False)
             ]
     lex_index = dxpy.DXGTable.lexicographic_index(index_cols, "search")
     self.dxgtable = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                        dxpy.DXGTable.make_column_desc("b", "int32")],
                                       indices=[lex_index])
     self.dxgtable.close(block=True)
     expected = {u"name": u"search",
                 u"type": u"lexicographic",
                 u"columns": [{u"name": u"a", u"order": u"asc", u"caseSensitive": False},
                              {u"name": u"b", u"order": u"desc"}]}
     desc = self.dxgtable.describe()
     self.assertEqual(expected, desc['indices'][0])
Exemple #10
0
    def test_add_rows_no_index(self):
        """Rows added without explicit part numbers coalesce into one part."""
        self.dxgtable = dxpy.new_dxgtable(
            [dxpy.DXGTable.make_column_desc("a", "string"),
             dxpy.DXGTable.make_column_desc("b", "int32")])
        for n in range(64):
            self.dxgtable.add_rows(data=[["row"+str(n), n]])

        # Flushing pushes all buffered rows as a single part.
        self.dxgtable.flush()
        self.assertEqual(len(self.dxgtable.describe()["parts"]), 1)

        self.dxgtable.close(block=True)

        self.assertEqual(self.dxgtable.describe()["length"], 64)
def main(**kwargs):
    columns = [dxpy.DXGTable.make_column_desc("word", "string")]

    # Call a subprocess and dump its output to a local file.
    # (Remove possessives and other bogus words from the word list)
    subprocess.check_call('egrep "^[a-z]+$" /usr/share/dict/american-english > words.txt', shell=True)

    # Parse the file we just generated into a GTable.
    with dxpy.new_dxgtable(columns=columns, mode="w") as output_gtable:
        for index, word in enumerate(open("words.txt")):
            output_gtable.add_row([word.strip()])
            if index % 10000 == 0:
                print "Read word: " + word.strip()
    # Closing the GTable automatically commences at the conclusion of the "with" block.

    return {"words": dxpy.dxlink(output_gtable.get_id())}
Exemple #12
0
def main(**kwargs):
    columns = [dxpy.DXGTable.make_column_desc("word", "string")]

    # Call a subprocess and dump its output to a local file.
    # (Remove possessives and other bogus words from the word list)
    subprocess.check_call('egrep "^[a-z]+$" /usr/share/dict/american-english > words.txt', shell=True)

    # Parse the file we just generated into a GTable.
    with dxpy.new_dxgtable(columns=columns, mode='w') as output_gtable:
        for index, word in enumerate(open("words.txt")):
            output_gtable.add_row([word.strip()])
            if index % 10000 == 0:
                print "Read word: " + word.strip()
    # Closing the GTable automatically commences at the conclusion of the "with" block.

    return {'words': dxpy.dxlink(output_gtable.get_id())}
Exemple #13
0
    def test_mappings_to_sam_conversion(self):
        """Build a one-row Mappings table and check dx-mappings-to-sam output.

        The single mapping carries every column the converter reads,
        including two optional SAM tag columns (sam_field_MD, sam_field_XN);
        -2147483648 is INT32_MIN, used here as the null sentinel for XN.
        """
        mappings_table = dxpy.new_dxgtable([
            dxpy.DXGTable.make_column_desc("sequence", "string"),
            dxpy.DXGTable.make_column_desc("quality", "string"),
            dxpy.DXGTable.make_column_desc("name", "string"),
            dxpy.DXGTable.make_column_desc("status", "string"),
            dxpy.DXGTable.make_column_desc("chr", "string"),
            dxpy.DXGTable.make_column_desc("lo", "int32"),
            dxpy.DXGTable.make_column_desc("hi", "int32"),
            dxpy.DXGTable.make_column_desc("negative_strand", "boolean"),
            dxpy.DXGTable.make_column_desc("error_probability", "uint8"),
            dxpy.DXGTable.make_column_desc("qc_fail", "boolean"),
            dxpy.DXGTable.make_column_desc("duplicate", "boolean"),
            dxpy.DXGTable.make_column_desc("cigar", "string"),
            dxpy.DXGTable.make_column_desc("template_id", "int64"),
            dxpy.DXGTable.make_column_desc("read_group", "uint16"),
            dxpy.DXGTable.make_column_desc("sam_field_MD", "string"),
            dxpy.DXGTable.make_column_desc("sam_field_XN", "int32")
        ])
        mappings_table.add_rows(data=[[
            "TAATAAGGTTGTTGTTGTTGTT",
            "1:1ADDDACFHA?HGFGIIE+<",
            "FOO.12345678",
            "PRIMARY",
            "1",
            54932368,
            54932390,
            False,
            60,
            False,
            False,
            "7M1D93M",
            289090731,
            0,
            "1A5^A93",
            -2147483648
        ]], part=1)
        mappings_table.set_details({
            "read_groups": [
                {"num_singles": 1, "num_pairs": 0}
            ],
            "original_contigset": {"$dnanexus_link": self.genome_id}
        })
        mappings_table.close(block=True)

        # BUG FIX: assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(run('dx-mappings-to-sam {g}'.format(g=mappings_table.get_id())),
                         self.expected_sam)
def import_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags):
    """Import a BED file as a Spans GTable with a genomic range index.

    Parameters:
        bed_file: path to the BED file to import.
        table_name: name to give the new table.
        ref_id: ID of the original contigset (reference genome).
        file_id: ID of the original file object, or None.
        additional_types, tags: accepted for interface compatibility
            (not used in this function).
        property_keys, property_values: parallel lists of detail key/value
            pairs to attach to the table.

    Returns:
        A dxlink to the new spans table.

    Raises:
        dxpy.AppError: on a malformed BED file or mismatched property lists.
    """
    num_cols = find_num_columns(bed_file)
    if num_cols < 3:
        raise dxpy.AppError("BED file contains less than the minimum 3 columns.  Invalid BED file.")

    columns = [("chr", "string"),
               ("lo", "int32"),
               ("hi", "int32")]

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]
    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=[gri_index], mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        # Idiom: identity comparison with None (was "!= None").
        if file_id is not None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        # Idiom: zip the parallel lists instead of indexing by range(len(...)).
        for prop_key, prop_value in zip(property_keys, property_values):
            details[prop_key] = prop_value
        span.set_details(details)

        span.add_types(["Spans","gri"])
        span.rename(table_name)

        for line in bed:
            # A "track" line is stored verbatim in the details, not as a row.
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            line = line.split()
            # A blank line terminates the import.
            if len(line) == 0:
                break
            if len(line) < 3:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in BED file contains less than the minimum 3 columns.  Invalid BED file.")
            line[1] = int(line[1])
            line[2] = int(line[2])

            span.add_row(line)

    return dxpy.dxlink(span.get_id())
Exemple #15
0
 def test_add_rows_bad_data(self):
     """add_rows must reject rows whose cell types don't match the schema."""
     self.dxgtable = dxpy.new_dxgtable([
             dxpy.DXGTable.make_column_desc("a", "string"),
             dxpy.DXGTable.make_column_desc("b", "float"),
             dxpy.DXGTable.make_column_desc("c", "int32"),
             dxpy.DXGTable.make_column_desc("d", "boolean"),
             ])
     # One mistyped cell per row: columns 0, 1, 2, 3 respectively.
     bad_rows = [[303, 1.248, 123, True],
                 ["303", "1.248", 123, True],
                 ["303", 1.248, 123.5, True],
                 ["303", 1.248, 123, "True"]]
     for part, bad_row in enumerate(bad_rows, 1):
         with self.assertRaises(ValueError):
             self.dxgtable.add_rows(data=[bad_row], part=part)
     # A correctly typed row is accepted.
     self.dxgtable.add_rows(data=[[u"303", 1.248, 123, True]], part=5)
     self.dxgtable.close(block=True)
Exemple #16
0
    def test_iter_table(self):
        """Plain iteration yields all rows; iterate_rows honors start/end/columns."""
        self.dxgtable = dxpy.new_dxgtable(
            [dxpy.DXGTable.make_column_desc("a", "string"),
             dxpy.DXGTable.make_column_desc("b", "int32")])
        for i in range(64):
            self.dxgtable.add_rows(data=[["row"+str(i), i]], part=i+1)
        self.dxgtable.close(block=True)

        # Plain iteration covers all 64 rows in order (row[2] is column "b").
        seen = 0
        for row in self.dxgtable:
            self.assertEqual(row[2], seen)
            seen += 1
        self.assertEqual(seen, 64)

        # start=1 skips the first row.
        seen = 0
        for row in self.dxgtable.iterate_rows(start=1):
            self.assertEqual(row[2], seen+1)
            seen += 1
        self.assertEqual(seen, 63)

        # end=2 stops after two rows.
        seen = 0
        for row in self.dxgtable.iterate_rows(end=2):
            self.assertEqual(row[2], seen)
            seen += 1
        self.assertEqual(seen, 2)

        # start and end together bound the range [1, 63).
        seen = 0
        for row in self.dxgtable.iterate_rows(start=1, end=63):
            self.assertEqual(row[2], seen+1)
            seen += 1
        self.assertEqual(seen, 62)

        # Column selection still yields one entry per row.
        seen = sum(1 for _ in self.dxgtable.iterate_rows(columns=['a'], start=1, end=63))
        self.assertEqual(seen, 62)
Exemple #17
0
def main(**kwargs):
    """Import a delimited text file (CSV or TSV) into a new GTable.

    Parses command-line-style arguments (or **kwargs), resolves the output
    project/folder/name, derives the column schema either from --columns or
    from the file's first row, then streams the rows into a new GTable.

    Exits via parser.exit(1, ...) on any argument, I/O, or API error.
    """
    if len(kwargs) == 0:
        args = parser.parse_args(sys.argv[1:])
    else:
        args = parser.parse_args(kwargs)

    try:
        process_dataobject_args(args)
    except Exception as details:
        parser.exit(1, unicode(details) + "\n")

    try:
        process_single_dataobject_output_args(args)
    except Exception as details:
        parser.exit(1, unicode(details) + "\n")

    # Resolve destination; the default table name comes from the input
    # filename, unless reading from stdin ("-").
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = get_env_var("DX_CLI_WD", u"/")
        if args.filename != "-":
            name = os.path.basename(args.filename)
        else:
            name = None
    else:
        project, folder, name = resolve_path(args.output)
        if name is None and args.filename != "-":
            name = os.path.basename(args.filename)

    args.indices = [] if args.indices is None else json.loads(args.indices)
    if args.gri is not None:
        args.indices.append(dxpy.DXGTable.genomic_range_index(args.gri[0], args.gri[1], args.gri[2]))
        args.types = ["gri"] if args.types is None else args.types + ["gri"]

    if args.filename == "-":
        fd = sys.stdin
    else:
        try:
            fd = open(args.filename, "rb")
        except IOError:
            # BUG FIX: narrowed from a bare "except:", which would also have
            # swallowed KeyboardInterrupt and SystemExit.
            parser.exit(1, fill(unicode("Could not open " + args.filename + " for reading")) + "\n")

    firstrow = fd.readline()

    if args.csv:
        delimiter = ","
        dialect = "excel"
    else:
        delimiter = "\t"
        dialect = "excel"
    # else:
    #     # Try to sniff the file format
    #     dialect = csv.Sniffer().sniff(firstrow)
    #     delimiter = dialect.delimiter
    firstrow_reader = csv.reader([firstrow], dialect=dialect, delimiter=delimiter)
    firstrow_data = firstrow_reader.next()
    reader = csv.reader(fd, dialect=dialect, delimiter=delimiter)

    # Build the column specs either from --columns or from the header row.
    # Each spec is "name" or "name:type"; untyped specs default to string.
    column_specs = []
    types = []
    if args.columns is not None:
        specs = split_unescaped(",", args.columns)
    else:
        specs = firstrow_data
    for spec in specs:
        if ":" in spec:
            col_type = spec[spec.find(":") + 1 :]
            column_specs.append({"name": spec[: spec.find(":")], "type": col_type})
            if "int" in col_type:
                types.append("int")
            elif col_type == "boolean":
                types.append("boolean")
            elif col_type in ["float", "double"]:
                types.append("float")
            elif col_type == "string":
                types.append("string")
            else:
                parser.exit(1, "Unrecognized column type: " + col_type + "\n")
        else:
            column_specs.append({"name": spec, "type": "string"})
            types.append("string")
    try:
        dxgtable = dxpy.new_dxgtable(
            project=project,
            name=name,
            tags=args.tags,
            types=args.types,
            hidden=args.hidden,
            properties=args.properties,
            details=args.details,
            folder=folder,
            parents=args.parents,
            columns=column_specs,
            indices=args.indices,
        )
        # When --columns was given, the first row is data rather than a header.
        if args.columns is not None:
            dxgtable.add_row([parse_item(firstrow_data[i], types[i]) for i in range(len(types))])
        for row in reader:
            dxgtable.add_row([parse_item(row[i], types[i]) for i in range(len(types))])
        dxgtable.close(block=args.wait)
        if args.brief:
            print(dxgtable.get_id())
        else:
            print_desc(dxgtable.describe(incl_properties=True, incl_details=True))
    except Exception as details:
        parser.exit(1, fill(unicode(details)) + "\n")
Exemple #18
0
def import_reads(job_input):
    """Import a FASTA/FASTQ read set (optionally paired, with separate
    quality files) into a new Reads GTable.

    If job_input is None, arguments are taken from the command line and
    merged into the module-level ``args`` dict; otherwise job_input is used
    as ``args`` directly.  Returns {'reads': dxlink} for the new table.

    Raises dxpy.AppError on unparsable properties or doubled qualities.
    """

    global args

    if job_input == None:
        temp = vars(parser.parse_args(sys.argv[1:]))
        for key in temp:
            if temp[key] != None:
                if key == 'tags':
                    args[key] = temp[key].split(",")
                    # remove whitespace around tags
                    for i in range(len(args[key])):
                        args[key][i] = args[key][i].rstrip().lstrip()
                elif key == 'properties':
                    try:
                        args[key] = ast.literal_eval(temp[key])
                    except SyntaxError:
                        raise dxpy.AppError("Cannot parse properties: " +
                                            temp[key])
                else:
                    args[key] = temp[key]

    else:
        args = job_input

    print(args)

    # The presence of a second reads file marks the run as paired-end.
    if 'file2' in args:
        paired = True
    else:
        paired = False

    is_fasta, is_colorspace, qual_encoding = sniff_fastq(args["file"])

    # FASTQ already embeds qualities, so a separate quality file is an error.
    if is_fasta == False and ('qual' in args or 'qual2' in args):
        raise dxpy.AppError(
            "Qualities supplied twice:  FASTQ format file found along with separate quality file."
        )

    if is_fasta and 'qual' not in args:
        reads_have_qualities = False
    else:
        reads_have_qualities = True

    # Build the column list conditionally: names, sequences, then qualities,
    # with a "*2" twin of each column when the run is paired.
    table_columns = []

    if not args['discard_names']:
        table_columns.append(("name", "string"))
        if paired:
            table_columns.append(("name2", "string"))

    table_columns.append(("sequence", "string"))
    if paired:
        table_columns.append(("sequence2", "string"))
    if reads_have_qualities and not args['discard_qualities']:
        table_columns.append(("quality", "string"))
        if paired:
            table_columns.append(("quality2", "string"))

    column_descriptors = [
        dxpy.DXGTable.make_column_desc(name, type)
        for name, type in table_columns
    ]
    logging.info("Constructed table schema:  %s" % column_descriptors)

    readsTable = dxpy.new_dxgtable(column_descriptors)
    # Tag the table by read flavor (colorspace vs. letter space).
    if is_colorspace:
        readsTable.add_types(['ColorReads', 'Reads'])
        details = readsTable.get_details()
        details['sequence_type'] = "color"
        readsTable.set_details(details)
    else:
        readsTable.add_types(['LetterReads', 'Reads'])

    if 'tags' in args:
        readsTable.add_tags(args['tags'])
    if 'properties' in args:
        readsTable.set_properties(args['properties'])

    if paired:
        details = readsTable.get_details()
        details['paired'] = True

        # TODO implement estimate paired read distance

        # otherwise take the values they give
        if 'pair_orientation' in args:
            details['pair_orientation'] = args['pair_orientation']
        if 'pair_min_dist' in args:
            details['pair_min_dist'] = args['pair_min_dist']
        if 'pair_max_dist' in args:
            details['pair_max_dist'] = args['pair_max_dist']
        if 'pair_avg_dist' in args:
            details['pair_avg_dist'] = args['pair_avg_dist']
        if 'pair_std_dev_dist' in args:
            details['pair_std_dev_dist'] = args['pair_std_dev_dist']

        readsTable.set_details(details)

    # generate translation table for enforcing string syntax
    # ("." and "-" become "N" in letter-space sequences)
    to_replace = ''.join([".", "-"])
    N = ''.join(['N'] * len(to_replace))
    transtable = string.maketrans(to_replace, N)

    for name1, seq1, qual1, name2, seq2, qual2 in iterate_reads(
            fastqa1_filename=args["file"],
            fastqa2_filename=args["file2"] if 'file2' in args else None,
            qual1_filename=args["qual"] if 'qual' in args else None,
            qual2_filename=args["qual2"] if 'qual2' in args else None,
            is_fasta=is_fasta,
            is_colorspace=is_colorspace,
            qual_encoding=qual_encoding):

        row = []
        # add name, stripping the FASTA ">" or FASTQ "@" record marker
        if args['discard_names'] == False:
            if is_fasta and name1[0] == '>':
                name1 = name1[1:]
            elif name1[0] == '@':
                name1 = name1[1:]
            row.append(name1)
            if paired:
                if is_fasta and name2[0] == '>':
                    name2 = name2[1:]
                elif name2[0] == '@':
                    name2 = name2[1:]
                row.append(name2)

        # enforce UPPERCASE
        seq1 = seq1.upper()
        if paired:
            seq2 = seq2.upper()

        # translate bad chars into Ns (colorspace sequences are left as-is)
        if not is_colorspace:
            seq1 = seq1.translate(transtable)
            if paired:
                seq2 = seq2.translate(transtable)

        # add seq
        row.append(seq1)
        if paired:
            row.append(seq2)

        # add quals
        if reads_have_qualities and not args['discard_qualities']:
            row.append(qual1)
            if paired:
                row.append(qual2)

        readsTable.add_row(row)

    # print out table ID
    print(json.dumps({'table_id': readsTable.get_id()}))

    if 'name' in args:
        tableName = args['name']
    else:
        tableName = remove_file_type(args['file']) + " reads"

    readsTable.rename(tableName)

    # set link to original FASTQ file object
    details = readsTable.get_details()

    if 'file_link' in args:
        details['original_files'] = [args['file_link']]
        if 'file2' in args:
            details['original_files'].append(args['file2_link'])
        if 'qual' in args:
            details['original_files'].append(args['qual_link'])
            if 'file2' in args:
                assert ('qual2' in args)
                details['original_files'].append(args['qual2_link'])

    readsTable.set_details(details)

    readsTable.close()

    # place table in output
    return {'reads': dxpy.dxlink(readsTable.get_id())}
def main(**job_inputs):
    """Orchestrate a map-reduce variant-calling run over a mappings table.

    Builds the output Variants GTable (schema derived from the samtools
    header tags plus per-sample columns), splits the genome into roughly
    equal regions, launches one "map" subjob per region, and launches a
    "reduce" subjob — gated on all map jobs — that closes the table.

    Returns {"variants": <job-based object reference to the reduce output>}.
    """
    job_outputs = {}
    mappingsTable = dxpy.open_dxgtable(job_inputs["mappings"]["$dnanexus_link"])
    mappingsTableId = mappingsTable.get_id()

    # This controls the degree of parallelism
    chunks = int(mappingsTable.describe()["length"] / job_inputs["reads_per_job"]) + 1

    try:
        contigSetId = mappingsTable.get_details()["original_contigset"]["$dnanexus_link"]
        originalContigSet = mappingsTable.get_details()["original_contigset"]
    except:
        raise Exception("The original reference genome must be attached as a detail")

    # In the next major section of code, we construct a variants table. As regions of the genome are passed to each worker
    # and variants are called on them, the workers will add rows to this table concurrently.

    variants_schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "ref", "type": "string"},
        {"name": "alt", "type": "string"},
        {"name": "qual", "type": "double"},
        {"name": "ids", "type": "string"},
    ]

    # The information in these tags is elevated into specific columns, so additional columns for these tags will not be created
    elevatedTags = ["format_GT", "format_DP", "format_AD"]

    # The info and format tags are extracted from the header printed by samtools
    # If additional code will add a tag to the output of the program, modify this header to include the tag.
    # TODO: Allow the table to be created by the first job that finishes to avoid this step.
    headerInfo = extractHeader("/tmp/header.txt", elevatedTags)
    description = {}
    samples = []

    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")]

    ##The following section creates the sample-specific table columns
    for k, v in headerInfo["tags"]["info"].iteritems():
        variants_schema.append({"name": "info_" + k, "type": translateTagTypeToColumnType(v)})
        description[k] = {"name": k, "description": v["description"], "type": v["type"], "number": v["number"]}

    # For each sample, add the sample-specific columns to the schema, at present only one sample is supported
    numSamples = 1
    for i in range(numSamples):
        variants_schema.extend(
            [
                {"name": "genotype_" + str(i), "type": "string"},
                {"name": "phasing_" + str(i), "type": "string"},
                {"name": "type_" + str(i), "type": "string"},
                {"name": "variation_qual_" + str(i), "type": "double"},
                {"name": "genotype_qual_" + str(i), "type": "double"},
                {"name": "coverage_" + str(i), "type": "string"},
                {"name": "total_coverage_" + str(i), "type": "int32"},
            ]
        )
        indices.append(dxpy.DXGTable.lexicographic_index([["type_" + str(i), "ASC"]], "type_" + str(i)))
        samples.append("Sample_0")
        for k, v in headerInfo["tags"]["format"].iteritems():
            if "format_" + k not in elevatedTags:
                variants_schema.append({"name": "format_" + k + "_" + str(i), "type": translateTagTypeToColumnType(v)})

    # TODO: Add lexicographic indices when secondary indices are supported

    # NOTE(review): only the gri index is passed here; the lexicographic
    # indices accumulated above are not used — see the TODO just above.
    variants = dxpy.new_dxgtable(variants_schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    tableId = variants.get_id()
    variants = dxpy.open_dxgtable(tableId)
    variants.add_types(["Variants", "gri"])

    details = {
        "samples": samples,
        "original_contigset": job_inputs["reference"],
        "original_mappings": job_inputs["mappings"],
        "formats": headerInfo["tags"]["format"],
        "infos": headerInfo["tags"]["info"],
    }
    # if headerInfo.get('filters') != {}:
    #  details['filters'] = headerInfo['filters']
    variants.set_details(details)

    if "output_name" in job_inputs:
        variants.rename(job_inputs["output_name"])
    else:
        variants.rename(mappingsTable.describe()["name"] + " variant calls by Samtools mpileup")

    # Split the genome into evenly sized regions
    genomeRegions = splitGenomeLengthLargePieces(originalContigSet, chunks)

    # Generate the command line arguments needed to run samtools and bcftools
    samOptions = makeSamtoolsParameters(**job_inputs)
    bcfOptions = makeBcftoolsParameters(**job_inputs)

    # The rest of the main function contains the map-reduce functionality. For each genome chunk, an input spec is created for a new child job.
    reduce_job_inputs = {}
    for i in range(len(genomeRegions)):
        if len(genomeRegions[i]) > 0:
            map_job_inputs = {
                "mappings_table_id": mappingsTableId,
                "original_contig_set": contigSetId,
                "interval": genomeRegions[i],
                "tableId": tableId,
                "compress_reference": job_inputs["compress_reference"],
                "compress_no_call": job_inputs["compress_no_call"],
                "infer_no_call": job_inputs["infer_no_call"],
                "sam_options": samOptions,
                "bcf_options": bcfOptions,
                "part_number": i,
            }
            # Run a "map" job for each chunk, passing in the inputspec from above and looking for a function entry point given as "map" (@dxpy.entry_point('map'))
            map_job = dxpy.new_dxjob(map_job_inputs, "map")
            reduce_job_inputs["mapJob" + str(i) + "TableId"] = {"job": map_job.get_id(), "field": "ok"}

    reduce_job_inputs["tableId"] = tableId

    # Run a "reduce" job, which only begins once all of the map jobs signal they have completed by sending 'ok':True
    # The reduce job closes the table. This step is explicitly needed because table closing must wait till the completion of the map jobs
    # By giving the reduce job the map jobs as input, the reduce job will wait to start.
    reduce_job = dxpy.new_dxjob(reduce_job_inputs, "reduce")
    job_outputs = {"variants": {"job": reduce_job.get_id(), "field": "variants"}}

    return job_outputs
def import_reads(job_input):
    """Import FASTQ/FASTA reads (optionally colorspace, optionally paired)
    into a new GTable of type Reads.

    When ``job_input`` is None the function is being run from the command
    line: options are parsed via the module-level ``parser`` and merged into
    the module-level ``args`` dict.  Otherwise ``job_input`` itself is used
    as the argument dict.

    Returns:
        dict of the form {'reads': <dxlink to the closed reads table>}.

    Raises:
        dxpy.AppError: if the 'properties' option cannot be parsed, or if
            qualities are supplied both inline (FASTQ) and as a separate
            quality file.
    """
    global args

    if job_input is None:
        # Command-line mode: merge parsed options into `args`, skipping
        # options the user did not supply (argparse leaves them as None).
        temp = vars(parser.parse_args(sys.argv[1:]))
        for key in temp:
            if temp[key] is not None:
                if key == 'tags':
                    args[key] = temp[key].split(",")
                    # remove whitespace around tags
                    for i in range(len(args[key])):
                        args[key][i] = args[key][i].rstrip().lstrip()
                elif key == 'properties':
                    try:
                        args[key] = ast.literal_eval(temp[key])
                    # literal_eval raises ValueError (not only SyntaxError)
                    # for well-formed but non-literal input such as "foo()",
                    # so catch both to report a proper AppError.
                    except (ValueError, SyntaxError):
                        raise dxpy.AppError("Cannot parse properties: " + temp[key])
                else:
                    args[key] = temp[key]

    else:
        args = job_input

    print(args)

    # Paired-end data is signalled by the presence of a second reads file.
    if 'file2' in args:
        paired = True
    else:
        paired = False

    is_fasta, is_colorspace, qual_encoding = sniff_fastq(args["file"])

    if is_fasta == False and ('qual' in args or 'qual2' in args):
        raise dxpy.AppError("Qualities supplied twice:  FASTQ format file found along with separate quality file.")

    # FASTA has no inline qualities; they may still arrive via a 'qual' file.
    if is_fasta and 'qual' not in args:
        reads_have_qualities = False
    else:
        reads_have_qualities = True

    # Build the table schema based on what the input actually contains.
    table_columns = []

    if not args['discard_names']:
        table_columns.append(("name", "string"))
        if paired:
            table_columns.append(("name2", "string"))

    table_columns.append(("sequence", "string"))
    if paired:
        table_columns.append(("sequence2", "string"))
    if reads_have_qualities and not args['discard_qualities']:
        table_columns.append(("quality", "string"))
        if paired:
            table_columns.append(("quality2", "string"))

    column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in table_columns]
    logging.info("Constructed table schema:  %s" % column_descriptors)

    readsTable = dxpy.new_dxgtable(column_descriptors)
    if is_colorspace:
        readsTable.add_types(['ColorReads', 'Reads'])
        details = readsTable.get_details()
        details['sequence_type'] = "color"
        readsTable.set_details(details)
    else:
        readsTable.add_types(['LetterReads', 'Reads'])

    if 'tags' in args:
        readsTable.add_tags(args['tags'])
    if 'properties' in args:
        readsTable.set_properties(args['properties'])

    if paired:
        details = readsTable.get_details()
        details['paired'] = True

        # TODO implement estimate paired read distance

        # otherwise take the values they give
        if 'pair_orientation' in args:
            details['pair_orientation'] = args['pair_orientation']
        if 'pair_min_dist' in args:
            details['pair_min_dist'] = args['pair_min_dist']
        if 'pair_max_dist' in args:
            details['pair_max_dist'] = args['pair_max_dist']
        if 'pair_avg_dist' in args:
            details['pair_avg_dist'] = args['pair_avg_dist']
        if 'pair_std_dev_dist' in args:
            details['pair_std_dev_dist'] = args['pair_std_dev_dist']

        readsTable.set_details(details)

    # Translation table mapping gap/unknown characters ('.' and '-') to 'N',
    # applied in one C-level pass per sequence below.
    to_replace = ".-"
    transtable = string.maketrans(to_replace, 'N' * len(to_replace))

    for name1, seq1, qual1, name2, seq2, qual2 in iterate_reads(fastqa1_filename=args["file"],
                                                                fastqa2_filename=args["file2"] if 'file2' in args else None,
                                                                qual1_filename=args["qual"] if 'qual' in args else None,
                                                                qual2_filename=args["qual2"] if 'qual2' in args else None,
                                                                is_fasta=is_fasta,
                                                                is_colorspace=is_colorspace,
                                                                qual_encoding=qual_encoding):

        row = []
        # add name, stripping the FASTA '>' / FASTQ '@' record marker
        if not args['discard_names']:
            if is_fasta and name1[0] == '>':
                name1 = name1[1:]
            elif name1[0] == '@':
                name1 = name1[1:]
            row.append(name1)
            if paired:
                if is_fasta and name2[0] == '>':
                    name2 = name2[1:]
                elif name2[0] == '@':
                    name2 = name2[1:]
                row.append(name2)

        # enforce UPPERCASE
        seq1 = seq1.upper()
        if paired:
            seq2 = seq2.upper()

        # translate bad chars into Ns (letter space only; colorspace reads
        # are digit strings and must not be rewritten)
        if not is_colorspace:
            seq1 = seq1.translate(transtable)
            if paired:
                seq2 = seq2.translate(transtable)

        # add seq
        row.append(seq1)
        if paired:
            row.append(seq2)

        # add quals
        if reads_have_qualities and not args['discard_qualities']:
            row.append(qual1)
            if paired:
                row.append(qual2)

        readsTable.add_row(row)

    # print out table ID
    print(json.dumps({'table_id': readsTable.get_id()}))

    if 'name' in args:
        tableName = args['name']
    else:
        tableName = remove_file_type(args['file']) + " reads"

    readsTable.rename(tableName)

    # set link to original FASTQ file object(s) in the table details
    details = readsTable.get_details()

    if 'file_link' in args:
        details['original_files'] = [ args['file_link'] ]
        if 'file2' in args:
            details['original_files'].append(args['file2_link'])
        if 'qual' in args:
            details['original_files'].append(args['qual_link'])
            if 'file2' in args:
                assert('qual2' in args)
                details['original_files'].append(args['qual2_link'])

    readsTable.set_details(details)

    readsTable.close()

    # place table in output
    return {'reads': dxpy.dxlink(readsTable.get_id())}
Exemple #21
0
def main(**job_inputs):
    """Import a VCF file into a new GTable of type Variants.

    Downloads and decompresses the input VCF, builds a per-sample variants
    schema from the VCF header, creates the GTable, then shells out to
    ``dx_vcfToVariants2`` to populate it.

    Raises dxpy.AppError if the VCF has more than 10 samples or if the
    import subprocess fails.
    """
    job_outputs = {}
    header = ''

    print "Downloading input VCF file"
    inputFile = dxpy.download_dxfile(job_inputs['vcf'], 'output.file')

    # NOTE(review): decompressFile presumably produces 'output.vcf' used
    # below — confirm against its definition.
    decompressFile('output.file')

    print "Constructing table schema"
    # These FORMAT tags get their own dedicated columns instead of generic
    # "format_<tag>_<i>" columns.
    elevatedTags = ['format_GT', 'format_DP', 'format_AD']
    headerInfo = extractHeader('output.vcf', elevatedTags)

    # Fixed leading columns shared by every variants table.
    variants_schema = [
      {"name": "chr", "type": "string"},
      {"name": "lo", "type": "int32"},
      {"name": "hi", "type": "int32"},
      {"name": "ref", "type": "string"},
      {"name": "alt", "type": "string"},
      {"name": "qual", "type": "double"},
      {"name": "ids", "type": "string"}
         ]

    description = {}
    samples = []

    # Only add a filter column when the header actually declared filters.
    if headerInfo.get('filters') != {}:
      variants_schema.append({"name": "filter", "type": "string"})
    # Genomic range index over (chr, lo, hi) for positional queries.
    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri')]

    formats = {}
    infos = {}
    filters = {}

    # One "info_<tag>" column per INFO tag declared in the header.
    for k, v in headerInfo['tags']['info'].iteritems():
        variants_schema.append({"name": "info_"+k, "type":translateTagTypeToColumnType(v)})
        description[k] = {'name' : k, 'description' : v['description'], 'type' : v['type'], 'number' : v['number']}

    # VCF sample columns start at index 9 of the #CHROM header line.
    numSamples = len(headerInfo['columns'].strip().split("\t")[9:])
    if numSamples > 10:
      raise dxpy.AppError("The VCF file contained too many samples, can't import a VCF containing more than 10 samples")
    if job_inputs['searchable_ids']:
      indices.append(dxpy.DXGTable.lexicographic_index([
        dxpy.DXGTable.lexicographic_index_column("ids", True, False),
        dxpy.DXGTable.lexicographic_index_column("chr"),
        dxpy.DXGTable.lexicographic_index_column("lo"),
        dxpy.DXGTable.lexicographic_index_column("hi")], "search"))
    #For each sample, write the sample-specific columns
    for i in range(len(headerInfo['columns'].strip().split("\t")[9:])):
      #This prevents name collision in columns
      variants_schema.extend([
        {"name": "genotype_"+str(i), "type": "string"},
        {"name": "phasing_"+str(i), "type": "string"},
        {"name": "type_"+str(i), "type": "string"},
        {"name": "variation_qual_"+str(i), "type": "double"},
        {"name": "genotype_qual_"+str(i), "type": "double"},
        {"name": "coverage_"+str(i), "type": "string"},
        {"name": "total_coverage_"+str(i), "type": "int32"}
      ])
      #indices.append(dxpy.DXGTable.lexicographic_index([["type_"+str(i), "ASC"]], 'type_'+str(i)))
      samples.append(headerInfo['columns'].strip().split("\t")[9:][i])
      # Non-elevated FORMAT tags get per-sample columns as well.
      for k, v in headerInfo['tags']['format'].iteritems():
        if "format_"+k not in elevatedTags:
          variants_schema.append({"name": "format_"+k+"_"+str(i), "type":translateTagTypeToColumnType(v)})

    # Output name: explicit input if given, otherwise the VCF file name with
    # its final extension stripped.
    if 'output_name' in job_inputs:
        name =  job_inputs['output_name']
    else:
        fileName = dxpy.DXFile(job_inputs['vcf']['$dnanexus_link']).describe()['name']
        name = fileName.split(".")[0]
        for x in fileName.split(".")[1:-1]:
            name += "."+x

    details = {'samples':samples, 'original_contigset':job_inputs['reference'], 'original_file':job_inputs['vcf'], 'formats':headerInfo['tags']['format'], 'infos':headerInfo['tags']['info'], 'alts':headerInfo['tags']['alt']}
    if headerInfo.get('filters') != {}:
      details['filters'] = headerInfo['filters']

    table = dxpy.new_dxgtable(variants_schema, indices=indices)
    table.set_details(details)
    types = ["Variants", "gri"]
    # Optional comma-separated extra types, ignoring empty entries.
    if 'additional_types' in job_inputs:
      for x in job_inputs['additional_types'].split(","):
        if x != '':
          types.append(x)
    table.add_types(types)

    if 'tags' in job_inputs:
        table.add_tags(job_inputs['tags'])
    if 'properties' in job_inputs:
        table.set_properties(job_inputs['properties'])

    table.rename(name)

    # Build the command line for the external row-import tool.
    command = "dx_vcfToVariants2"
    command += " --table_id " + str(table.get_id())
    command += " --vcf_file output.vcf"
    if job_inputs['compress_reference']:
        command += " --compress_reference"
    if job_inputs['infer_no_call']:
        command += " --infer_no_call"
    if job_inputs['compress_no_call']:
      command += " --compress_no_call"
    command += " --encoding "+job_inputs["file_encoding"]

    print "Importing variants by running:", command
    try:
      subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
      # The tool writes a user-facing message to AppError.txt on failure;
      # surface it if present, otherwise report a generic error.
      try:
        errorData = open("AppError.txt", 'r').read()
        raise dxpy.AppError(errorData)
      except IOError:
        raise dxpy.AppError("An unknown error occurred. Please check the log file")

    attach_empty_trackspec(table)
    table.close()
    result = dxpy.dxlink(table.get_id())

    job_outputs['variants'] = result
    return job_outputs
Exemple #22
0
def main(BAM, reference, mb_per_chunk):
    """Create a Mappings GTable for a BAM file and fan out import work.

    Splits the work into roughly ``size / (mb_per_chunk MB)`` chunks, launches
    one 'process' subjob per chunk, then a 'postprocess' job that depends on
    all of them.  Returns {'mappings': <dxlink to the (still open) table>}.
    """
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    #BAM = dxpy.DXFile(BAM)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    #dxpy.download_dxfile(BAM.get_id(), "input.bam")

    # Split your work into parallel tasks.  As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.

    # One chunk per mb_per_chunk megabytes of BAM, with at least one chunk.
    chunks = int(dxpy.DXFile(BAM).describe()['size']/(1000000*mb_per_chunk))
    if chunks < 1:
        chunks = 1

    #subprocess.check_call("samtools view input.bam -H -o header.txt", shell=True)
    #chromosomes  = re.findall("SN:([^\t]*)", line.strip())

    # Schema for the output mappings table.
    schema = [
        {"name": "sequence", "type":"string"},
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "int32"},
        {"name": "hi", "type": "int32"},
        {"name": "negative_strand", "type": "boolean"},
        {"name": "cigar", "type": "string"}
         ]

    # Genomic range index over (chr, lo, hi) for positional queries.
    mappingsTable = dxpy.new_dxgtable(schema, indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", "gri")])
    mappingsTable.add_types(["Mappings"])

    mappingsTable.set_details({"original_contigset":dxpy.dxlink(reference)})

    # Launch one 'process' subjob per chunk; each receives the shared table
    # ID plus its chunk number so it knows which slice of the BAM to import.
    subjobs = []
    for i in range(chunks):

        #subprocess.check_call("samtools view -b input.bam -F 4 -o subset.bam %s" % (" ".join(chromosomes[i::chunks])), shell=True)
        #jobFile = dxpy.upload_local_file("subset.bam").get_id()

        subjob_input = { "tableId": mappingsTable.get_id(),
                         "BAM": BAM,
                         "job_id": i,
                         "chunks": chunks}
        subjobs.append(dxpy.new_dxjob(subjob_input, 'process'))

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  If you give it any inputs that
    # use outputs from the "process" jobs, then it will automatically
    # wait for those jobs to finish before it starts running.  If you
    # do not need to give it any such inputs, you can explicitly state
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either DXJob objects or string job IDs in the list).

    postprocess_job = dxpy.new_dxjob(fn_input={ "process_outputs": [subjob.get_output_ref("output") for subjob in subjobs] },
                                     fn_name='postprocess',
                                     depends_on=subjobs)

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a reference.  If the output field in the postprocess
    # function is called "answer", you can pass that on here as
    # follows:
    #
    # return { "app_output_field": postprocess_job.get_output_ref("answer"), ...}

    # NOTE(review): the table is returned while still open; presumably the
    # postprocess job closes it — confirm against the 'postprocess' entry point.
    output = {'mappings': dxpy.dxlink(mappingsTable.get_id())}

    return output
def import_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags, isBedDetail, delimiter="\t"):
    """Import a BED (or bedDetail) file into a new GTable of type Spans.

    Builds a column set matching the number of columns found in the file,
    creates the table with a genomic range index (plus a name search index
    when a name column exists), streams the rows in, and returns a dxlink
    to the new table.

    Raises dxpy.AppError on property key/value count mismatch or rows with
    fewer than 3 columns.

    NOTE(review): the `additional_types` and `tags` parameters are accepted
    but never used in this body — confirm whether that is intentional.
    """
    num_cols = find_num_columns(bed_file, delimiter)

    # if this is a bedDetail file we should treat the last two columns separately
    if isBedDetail:
        num_cols -= 2

    # The standard BED columns, in file order; only the first num_cols apply.
    possible_columns = [("chr", "string"),
                        ("lo", "int32"),
                        ("hi", "int32"),
                        ("name", "string"),
                        ("score", "float"),
                        ("strand", "string"),
                        ("thick_start", "int32"),
                        ("thick_end", "int32"),
                        ("item_rgb", "string")]

    bedDetail_columns = [("bedDetail_ID", "string"),
                         ("bedDetail_desc", "string")]

    # Default value for each possible column, used when a row is short.
    possible_default_row = ["", 0, 0, "", 0, ".", 0, 0, ""]

    columns = possible_columns[:num_cols]

    if isBedDetail:
        columns.extend(bedDetail_columns)

    # Files with more than the 9 standard columns get generic string columns.
    if num_cols > len(columns):
        for i in range(len(columns), num_cols):
            columns.append(("BED_column_"+str(i+1), "string"))
            possible_default_row.append("")

    default_row = possible_default_row[:num_cols]

    if isBedDetail:
        default_row.extend(["",""])

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]

    # Genomic range index always; add a lexicographic "search" index only
    # when the file is wide enough to include the name column.
    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri')]
    for c in columns:
        if "name" in c:
            indices.append(dxpy.DXGTable.lexicographic_index([
                              dxpy.DXGTable.lexicographic_index_column("name", True, False),
                              dxpy.DXGTable.lexicographic_index_column("chr"),
                              dxpy.DXGTable.lexicographic_index_column("lo"),
                              dxpy.DXGTable.lexicographic_index_column("hi")], "search"))
            break

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        # Caller-supplied key/value pairs are merged straight into details.
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]    
    
        span.set_details(details)

        span.add_types(["Spans", "gri"])
        span.rename(table_name)

        for line in bed:
            row = list(default_row)

            # "track" header lines are stored in the details, not as rows.
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            line = line.split(delimiter)
            if isBedDetail:
                # only the first 4 columns are guaranteed to be defined by UCSC
                validate_line(line[:4])
                # save last two fields separately
                bedDetailFields = line[-2:]
                line = line[:-2]     
            else:        
                validate_line(line[:num_cols])
            
            # check to see if this is a weird line
            if len(line) == 0:
                break
            if len(line) < 3:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in BED file contains less than the minimum 3 columns.  Invalid BED file.")

            try:
                row[0] = line[0]
                row[1] = int(line[1])
                row[2] = int(line[2])
                row[3] = line[3]
                # dashes are sometimes used when field is invalid
                if line[4] == "-" or line[4] == ".":
                    line[4] = 0
                row[4] = float(line[4])
                row[5] = line[5]
                # dashes are sometimes used when field is invalid
                if line[6] == "-" or line[6] == ".":
                    line[6] = 0
                row[6] = int(line[6])
                # dashes are sometimes used when field is invalid
                if line[7] == "-" or line[7] == ".":
                    line[7] = 0
                row[7] = int(line[7])
                row[8] = line[8]

            # an index error would come from having fewer columns in a row, which we should handle ok
            except IndexError:
                pass
            # value error when fields are messed up and string gets converted to int, etc.  Throw these out.
            except ValueError:
                continue
            
            if isBedDetail:
                # add these in at the end if we have a bedDetail file
                row[num_cols] = bedDetailFields[0]
                row[num_cols+1] = bedDetailFields[1]
            
            span.add_row(row)

        span.flush()

    return dxpy.dxlink(span.get_id())
def main(DX_APP_WIZARD_INPUT_SIGNATURE):
# NOTE(review): this is a dx-app-wizard code-generation template, not
# runnable Python — the DX_APP_WIZARD_* tokens (including the || name
# placeholders below) are substituted by the generator.
DX_APP_WIZARD_INITIALIZE_INPUTDX_APP_WIZARD_DOWNLOAD_ANY_FILES
    # First, create the output GTable that will contain your results.
    # NOTE: You must specify the columns and indices for a GTable when
    # you create it, and they are immutable thereafter.
    #
    # Note: If you are filtering a GTable or are otherwise happy with
    # using the same exact columns and indices as your input GTable,
    # you can easily initialize your new GTable as follows:
    #
    # DX_APP_WIZARD_||_OUTPUT = dxpy.new_dxgtable(init_from=DX_APP_WIZARD_||_INPUT)
    #
    # In the more general case, you may want to specify different
    # columns.  The following lines assume you would like to create a
    # GTable with a genomic range index, i.e. there is a string column
    # for chromosome names and two integer columns for low and high
    # coordinates.

    columns = [dxpy.DXGTable.make_column_desc("chr", "string"),
               dxpy.DXGTable.make_column_desc("lo", "int"),
               dxpy.DXGTable.make_column_desc("hi", "int"),
               dxpy.DXGTable.make_column_desc("somedata", "string")]
    DX_APP_WIZARD_||_OUTPUT = dxpy.new_dxgtable(columns=columns,
                                                          indices=[dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")])

    # Split your input to be solved by the next stage of your app.
    # The following assumes you are splitting the input by giving
    # 100000 rows of a GenomicTable per subjob running the
    # "process" entry point.

    num_rows = DX_APP_WIZARD_||_INPUT.describe()["length"]

    subjobs = []
    for i in range(num_rows / row_chunk_size + (0 if num_rows % row_chunk_size == 0 else 1)):
        subjob_input = { "input_gtable_id": DX_APP_WIZARD_||_INPUT.get_id(),
                         "start_row": row_chunk_size * i,
                         "end_row": min(row_chunk_size * (i + 1), num_rows),
                         "output_gtable_id": DX_APP_WIZARD_||_OUTPUT.get_id()}
        subjobs.append(dxpy.new_dxjob(subjob_input, "process"))

    # The next line creates the job that will perform the
    # "postprocess" step of your app.  It assumes that you do not need
    # to aggregate any output from your "process" stages (other than
    # closing the output GTable), but you can add the output of those
    # stages to the input of your "postprocess" stage easily by adding
    # the following value as a field in the "fn_input" dict and adding
    # the parameter to your "postprocess" entry point.
    #
    #   fn_input={"process_outputs": [subjob.get_output_ref("output") for subjob in subjobs], ...}
    #
    # With no other input other than the output GTable ID for the
    # "postprocess" stage, we will force it to run only after all the
    # "process" stages have finished running by providing the list of
    # their DXJob handlers to the "depends_on" field (it accepts
    # either dxpy handlers or string IDs in the list).

    postprocess_job = dxpy.new_dxjob(fn_input={ "output_gtable_id": DX_APP_WIZARD_||_OUTPUT.get_id() },
                                     fn_name="postprocess",
                                     depends_on=subjobs)

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field is
    # called "answer", you can pass that on here as follows:
    #
    # return {"app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as GTables) which are closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.

    # NOTE(review): `output` is assigned but never returned in this visible
    # template — presumably the generator appends output fields and a
    # `return output` statement; confirm against the wizard source.
    output = {}
def main(**kwargs):
    if len(kwargs) == 0:
        args = parser.parse_args(sys.argv[1:])
    else:
        args = parser.parse_args(kwargs)

    try:
        process_dataobject_args(args)
    except BaseException as details:
        parser.exit(1, unicode(details) + '\n')

    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = os.environ.get('DX_CLI_WD', '/')
        if args.filename != '-':
            name = os.path.basename(args.filename)
        else:
            name = None
    else:
        project, folder, name = resolve_path(args.output)
        if name is None and args.filename != '-':
            name = os.path.basename(args.filename)

    args.indices = [] if args.indices is None else json.loads(args.indices)
    if args.gri is not None:
        args.indices.append(dxpy.DXGTable.genomic_range_index(args.gri[0], args.gri[1], args.gri[2]))
        args.types = ['gri'] if args.types is None else args.types + ['gri']

    if args.filename == '-':
        fd = sys.stdin
    else:
        try:
            fd = open(args.filename, 'rb')
        except:
            parser.exit(1, fill(unicode('Could not open ' + args.filename + ' for reading')) + '\n')

    firstrow = fd.readline()

    if args.csv:
        delimiter = ','
        dialect = 'excel'
    else:
        delimiter = '\t'
        dialect = 'excel'
    # else:
    #     # Try to sniff the file format
    #     dialect = csv.Sniffer().sniff(firstrow)
    #     delimiter = dialect.delimiter
    firstrow_reader = csv.reader([firstrow], dialect=dialect,
                                 delimiter=delimiter)
    firstrow_data = firstrow_reader.next()
    reader = csv.reader(fd, dialect=dialect,
                        delimiter=delimiter)

    column_specs = []
    types = []
    if args.columns is not None:
        specs = split_unescaped(',', args.columns)
    else:
        specs = firstrow_data
    for spec in specs:
        if ':' in spec:
            col_type = spec[spec.find(':') + 1:]
            column_specs.append({'name': spec[:spec.find(':')],
                                 'type': col_type})
            if 'int' in col_type:
                types.append('int')
            elif col_type == 'boolean':
                types.append('boolean')
            elif col_type in ['float', 'double']:
                types.append('float')
            elif col_type == 'string':
                types.append('string')
            else:
                parser.exit(1, 'Unrecognized column type: ' + col_type + '\n')
        else:
            column_specs.append({'name': spec,
                                 'type': 'string'})
            types.append('string')
    try:
        dxgtable = dxpy.new_dxgtable(project=project, name=name,
                                     tags=args.tags, types=args.types, 
                                     hidden=args.hidden, properties=args.properties,
                                     details=args.details,
                                     folder=folder,
                                     parents=args.parents,
                                     columns=column_specs,
                                     indices=args.indices)
        if args.columns is not None:
            dxgtable.add_row([ parse_item(firstrow_data[i], types[i]) for i in range(len(types))])
        for row in reader:
            dxgtable.add_row([ parse_item(row[i], types[i]) for i in range(len(types))])
        dxgtable.close(block=args.wait)
        if args.brief:
            print dxgtable.get_id()
        else:
            print_desc(dxgtable.describe(incl_properties=True, incl_details=True))
    except BaseException as details:
        parser.exit(1, fill(unicode(details)) + '\n')
def main(**kwargs):
    """Import a delimited text file (CSV or TSV) into a new GTable.

    With no kwargs, arguments come from ``sys.argv`` via the module-level
    ``parser``; otherwise ``kwargs`` is handed to the parser directly.
    Column names/types come from ``--columns`` when given, else from the
    file's first row.  Exits via ``parser.exit(1, ...)`` on any error.
    """
    if len(kwargs) == 0:
        args = parser.parse_args(sys.argv[1:])
    else:
        args = parser.parse_args(kwargs)

    try:
        process_dataobject_args(args)
    except Exception as details:
        parser.exit(1, unicode(details) + '\n')

    try:
        process_single_dataobject_output_args(args)
    except Exception as details:
        parser.exit(1, unicode(details) + '\n')

    # Resolve destination project/folder/name from --output, defaulting to
    # the current workspace and the input file's basename.
    if args.output is None:
        project = dxpy.WORKSPACE_ID
        folder = dxpy.config.get('DX_CLI_WD', u'/')
        if args.filename != '-':
            name = os.path.basename(args.filename)
        else:
            name = None
    else:
        project, folder, name = resolve_path(args.output)
        if name is None and args.filename != '-':
            name = os.path.basename(args.filename)

    args.indices = [] if args.indices is None else json.loads(args.indices)
    if args.gri is not None:
        # --gri chr lo hi adds a genomic range index and the 'gri' type.
        args.indices.append(
            dxpy.DXGTable.genomic_range_index(args.gri[0], args.gri[1],
                                              args.gri[2]))
        args.types = ['gri'] if args.types is None else args.types + ['gri']

    if args.filename == '-':
        fd = sys.stdin
    else:
        try:
            fd = open(args.filename, 'rb')
        except:
            parser.exit(
                1,
                fill(
                    unicode('Could not open ' + args.filename +
                            ' for reading')) + '\n')

    firstrow = fd.readline()

    if args.csv:
        delimiter = ','
        dialect = 'excel'
    else:
        delimiter = '\t'
        dialect = 'excel'
    # else:
    #     # Try to sniff the file format
    #     dialect = csv.Sniffer().sniff(firstrow)
    #     delimiter = dialect.delimiter
    firstrow_reader = csv.reader([firstrow],
                                 dialect=dialect,
                                 delimiter=delimiter)
    firstrow_data = firstrow_reader.next()
    reader = csv.reader(fd, dialect=dialect, delimiter=delimiter)

    # Build the column spec list and a parallel list of parse types; specs
    # are "name:type" entries from --columns, or bare names (string) from
    # the header row.
    column_specs = []
    types = []
    if args.columns is not None:
        specs = split_unescaped(',', args.columns)
    else:
        specs = firstrow_data
    for spec in specs:
        if ':' in spec:
            col_type = spec[spec.find(':') + 1:]
            column_specs.append({
                'name': spec[:spec.find(':')],
                'type': col_type
            })
            if 'int' in col_type:
                types.append('int')
            elif col_type == 'boolean':
                types.append('boolean')
            elif col_type in ['float', 'double']:
                types.append('float')
            elif col_type == 'string':
                types.append('string')
            else:
                parser.exit(1, 'Unrecognized column type: ' + col_type + '\n')
        else:
            column_specs.append({'name': spec, 'type': 'string'})
            types.append('string')
    try:
        dxgtable = dxpy.new_dxgtable(project=project,
                                     name=name,
                                     tags=args.tags,
                                     types=args.types,
                                     hidden=args.hidden,
                                     properties=args.properties,
                                     details=args.details,
                                     folder=folder,
                                     parents=args.parents,
                                     columns=column_specs,
                                     indices=args.indices)
        # When --columns was given, the first row is data, not a header.
        if args.columns is not None:
            dxgtable.add_row([
                parse_item(firstrow_data[i], types[i])
                for i in range(len(types))
            ])
        for row in reader:
            dxgtable.add_row(
                [parse_item(row[i], types[i]) for i in range(len(types))])
        dxgtable.close(block=args.wait)
        if args.brief:
            print(dxgtable.get_id())
        else:
            print_desc(
                dxgtable.describe(incl_properties=True, incl_details=True))
    except Exception as details:
        parser.exit(1, fill(unicode(details)) + '\n')
Exemple #27
0
def main(**job_inputs):
    """Entry point for the BWA read-mapping app (Python 2).

    Validates the input reads tables (uniform pairing/qualities, and
    Flow- vs Letter-reads typing), resolves or builds a BWA-indexed
    reference, creates the output Mappings GTable with a schema derived
    from the inputs, then launches one "map" subjob per chunk of reads
    rows and a single "postprocess" subjob that depends on all of them.

    Returns the job_outputs dict, containing 'indexed_reference' and a
    job-based field reference for 'mappings'.
    """
    job_outputs = {}
    # Resolve the input links to raw table IDs and cache each table's
    # describe output and column names for the feature checks below.
    reads_inputs = job_inputs['reads']
    reads_ids = [r['$dnanexus_link'] for r in reads_inputs]
    reads_descriptions = {r: dxpy.DXGTable(r).describe() for r in reads_ids}
    reads_columns = {r: [col['name'] for col in desc['columns']] for r, desc in reads_descriptions.items()}

    print reads_inputs
    print reads_ids
    print reads_descriptions
    print reads_columns

    # Feature flags over ALL input tables: types and optional columns.
    all_reads_have_FlowReads_tag = all(['FlowReads' in desc['types'] for desc in reads_descriptions.values()])
    all_reads_have_LetterReads_tag = all(['LetterReads' in desc['types'] for desc in reads_descriptions.values()])
    reads_have_names = any(['name' in columns for columns in reads_columns.values()])
    reads_are_paired = any(['sequence2' in columns for columns in reads_columns.values()])
    reads_have_qualities = any(['quality' in columns for columns in reads_columns.values()])
    # Qualities and pairing must be uniform across inputs: if any table
    # has them, all must.
    if reads_have_qualities:
        assert(all(['quality' in columns for columns in reads_columns.values()]))
    if reads_are_paired:
        all_paired = all(['sequence2' in columns for columns in reads_columns.values()])
        if not all_paired:
            raise dxpy.AppError("Reads to be mapped must be either all paired or all unpaired.  App input contains both paired and unpaired reads.")

    if job_inputs["algorithm"] == "bwasw":
        assert(not reads_are_paired) # bwasw does not support paired inputs

    assert(all_reads_have_FlowReads_tag or all_reads_have_LetterReads_tag)

    # The reference may arrive pre-indexed (BwaLetterContigSetV3) or as a
    # plain ContigSet that must be indexed (or matched to a cached index).
    reference_record_types = dxpy.describe(job_inputs['reference'])['types']
    if "BwaLetterContigSetV3" in reference_record_types:
        input_ref_is_indexed = True
    elif "ContigSet" in reference_record_types:
        input_ref_is_indexed = False
    else:
        raise dxpy.ProgramError("Unrecognized object passed as reference. It must be a ContigSet record or a BwaLetterContigSetV3 file")

    if input_ref_is_indexed:
        job_outputs['indexed_reference'] = job_inputs['reference']
    else:
        # Look for a previously built index linked to this ContigSet
        # before paying the cost of building a fresh one.
        found_cached_idx = False
        for result in dxpy.find_data_objects(classname='record',
                                             typename='BwaLetterContigSetV3',
                                             link=job_inputs['reference']['$dnanexus_link']):
            job_outputs['indexed_reference'] = dxpy.dxlink(result['id'])
            found_cached_idx = True
            break
        if not found_cached_idx:
            job_outputs['indexed_reference'] = dxpy.dxlink(make_indexed_reference(job_inputs))

    # Build the output schema; optional columns are included only when
    # the corresponding feature was detected in the inputs.
    table_columns = [("sequence", "string")]
    if reads_have_names:
        table_columns.append(("name", "string"))
    if reads_have_qualities:
        table_columns.append(("quality", "string"))
    table_columns.extend([("status", "string"),
                          ("chr", "string"),
                          ("lo", "int32"),
                          ("hi", "int32"),
                          ("negative_strand", "boolean"),
                          ("error_probability", "uint8"),
                          ("qc_fail", "boolean"),
                          ("duplicate", "boolean"),
                          ("cigar", "string"),
                          ("template_id", "int64"),
                          ("read_group", "int32")])

    # optional sam fields: RG BC XC XT NM CM XN SM AM XM X0 X1 XG MD XA

    if reads_are_paired:
        table_columns.extend([("mate_id", "int32"), # TODO: int8
                              ("status2", "string"),
                              ("chr2", "string"),
                              ("lo2", "int32"),
                              ("hi2", "int32"),
                              ("negative_strand2", "boolean"),
                              ("proper_pair", "boolean")])

    if all_reads_have_FlowReads_tag:
        table_columns.extend([("flowgram", "string"),
                              ("flow_indices", "string"),
                              ("clip_qual_left", "int32"),
                              ("clip_qual_right", "int32"),
                              ("clip_adapter_left", "int32"),
                              ("clip_adapter_right", "int32")])

    table_columns.extend([("sam_field_BC", "string"),
                          ("sam_field_XC", "int32"),
                          ("sam_field_XT", "string"),
                          ("sam_field_NM", "int32"),
                          ("sam_field_CM", "int32"),
                          ("sam_field_XN", "int32"),
                          ("sam_field_SM", "int32"),
                          ("sam_field_AM", "int32"),
                          ("sam_field_XM", "int32"),
                          ("sam_field_X0", "int32"),
                          ("sam_field_X1", "int32"),
                          ("sam_field_XG", "int32"),
                          ("sam_field_MD", "string"),
                          ("sam_field_XA", "string"),
                          ("sam_optional_fields", "string")])


    column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in table_columns]

    # The mappings table is indexed by genomic range over (chr, lo, hi).
    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
    t = dxpy.new_dxgtable(column_descriptors, indices=[gri_index])

    if input_ref_is_indexed:
        original_contigset = dxpy.get_details(job_inputs['reference'])['original_contigset']
    else:
        original_contigset = job_inputs['reference']
    t.set_details({'original_contigset': original_contigset})

    t.add_types(["LetterMappings", "Mappings", "gri"])

    # name table
    if 'output_name' in job_inputs:
        t.rename(job_inputs['output_name'])
    else:
        first_reads_name = dxpy.DXGTable( job_inputs['reads'][0] ).describe()['name']
        contig_set_name = dxpy.describe(job_inputs['reference'])['name']
        # if we're working on an indexed_reference we're not guaranteed to have access to original_contigset
        if input_ref_is_indexed:
            contig_set_name = contig_set_name.split(' (index')[0]
        t.rename(first_reads_name + " mapped to " + contig_set_name)

    # declare how many paired or single reads are in each reads table
    read_group_lengths = []
    for i in range(len(reads_ids)):
        current_length = reads_descriptions[reads_ids[i]]["length"]
        # A table is treated as paired iff it has a 'sequence2' column.
        if 'sequence2' in dxpy.DXGTable(reads_ids[i]).get_col_names():
            num_pairs = current_length
            num_singles = 0
        else:
            num_pairs = 0
            num_singles = current_length

        read_group_lengths.append( {"num_singles":num_singles, "num_pairs":num_pairs} )

    details = t.get_details()
    details['read_groups'] = read_group_lengths
    t.set_details(details)

    # Cumulative row offsets let each map subjob address its slice of the
    # concatenated input tables.
    row_offsets = []; row_cursor = 0
    for i in range(len(reads_ids)):
        row_offsets.append(row_cursor)
        row_cursor += reads_descriptions[reads_ids[i]]["length"]

    chunk_size = job_inputs["chunk_size"]

    map_job_inputs = job_inputs.copy()
    map_job_inputs["row_offsets"] = row_offsets
    map_job_inputs["num_rows"] = chunk_size
    map_job_inputs["table_id"] = t.get_id()
    map_job_inputs["indexed_reference"] = job_outputs['indexed_reference']

    postprocess_job_inputs = job_inputs.copy()
    postprocess_job_inputs["table_id"] = t.get_id()

    # Launch one map subjob per chunk.  NOTE(review): map_job_inputs is
    # mutated and reused each iteration -- this presumes new_dxjob
    # snapshots its inputs at submission time; confirm.
    for start_row in xrange(0, row_cursor, chunk_size):
        map_job_inputs["start_row"] = start_row
        map_job = dxpy.new_dxjob(map_job_inputs, "map")
        print "Launched map job with", map_job_inputs
        # Job-based field references make postprocess wait on every map job.
        postprocess_job_inputs["chunk%dresult" % start_row] = {'job': map_job.get_id(), 'field': 'ok'}
        postprocess_job_inputs["chunk%ddebug" % start_row] = {'job': map_job.get_id(), 'field': 'debug'}

    postprocess_job = dxpy.new_dxjob(postprocess_job_inputs, "postprocess")

    job_outputs['mappings'] = {'job': postprocess_job.get_id(), 'field': 'mappings'}

    print "MAIN OUTPUT:", job_outputs
    return job_outputs
Exemple #28
0
    def test_gri(self):
        """Exercise the genomic range index ('gri'): index construction,
        offset/limit row retrieval, range queries, and both row iterators."""
        sample_rows = [['chr2', 22, 28, 'j'],
                       ['chr1',  0,  3, 'a'],
                       ['chr1',  5,  8, 'b'],
                       ['chr1', 25, 30, 'i'],
                       ['chr1',  6, 10, 'c'],
                       ['chr1', 19, 20, 'h'],
                       ['chr1',  8,  9, 'd'],
                       ['chr1', 17, 19, 'g'],
                       ['chr1', 15, 23, 'e'],
                       ['chr1', 16, 21, 'f']]
        col_specs = [{"name": 'foo', "type": 'string'},
                     {"name": 'bar', "type": 'int32'},
                     {"name": 'baz', "type": 'int32'},
                     {"name": 'quux', "type": 'string'}]
        gri = dxpy.DXGTable.genomic_range_index('foo', 'bar', 'baz')
        self.assertEqual(gri, {"name": "gri", "type": "genomic",
                               "chr": "foo", "lo": "bar", "hi": "baz"})

        table = dxpy.new_dxgtable(col_specs, indices=[gri])
        self.assertEqual(table.describe()["indices"], [gri])

        # Upload the rows in several parts with non-consecutive part numbers.
        for part, (lo, hi) in zip((1, 10, 100, 1000),
                                  ((0, 3), (3, 6), (6, 9), (9, 10))):
            table.add_rows(sample_rows[lo:hi], part)

        table.close(True)

        self.assertEqual(table.describe()["length"], 10)

        # Offset + limit queries.
        page = table.get_rows(starting=0, limit=1)
        self.assertEqual(page["data"], [[0, 'chr1', 0, 3, 'a']])
        self.assertEqual(page["next"], 1)
        self.assertEqual(page["length"], 1)

        page = table.get_rows(starting=4, limit=3)
        self.assertEqual(page["data"],
                         [[4, 'chr1', 15, 23, 'e'],
                          [5, 'chr1', 16, 21, 'f'],
                          [6, 'chr1', 17, 19, 'g']])
        self.assertEqual(page["next"], 7)
        self.assertEqual(page["length"], 3)

        # Range query: only row 'e' matches this window.
        query = dxpy.DXGTable.genomic_range_query('chr1', 22, 25)
        page = table.get_rows(query=query)
        self.assertEqual(page["data"], [[4, 'chr1', 15, 23, 'e']])
        self.assertEqual(page["next"], None)
        self.assertEqual(page["length"], 1)

        # Range query whose matches are nonconsecutive row ids.
        query = dxpy.DXGTable.genomic_range_query('chr1', 20, 26)
        page = table.get_rows(query=query)
        self.assertEqual(page["data"],
                         [[4, 'chr1', 15, 23, 'e'],
                          [5, 'chr1', 16, 21, 'f'],
                          [8, 'chr1', 25, 30, 'i']])
        self.assertEqual(page["next"], None)
        self.assertEqual(page["length"], 3)

        # iterate_rows yields rows with ids [5, 8).
        self.assertEqual([row[0] for row in table.iterate_rows(5, 8)],
                         [5, 6, 7])

        # iterate_query_rows yields the same matches as the range query above.
        query = dxpy.DXGTable.genomic_range_query('chr1', 20, 26)
        self.assertEqual([row[0] for row in table.iterate_query_rows(query)],
                         [4, 5, 8])
Exemple #29
0
def constructTable(inputFileName):
    """Scan the input annotation file and create the spans GTable for it.

    Makes one pass over inputFileName (a tab- or space-delimited file with
    8 or 9 data fields per row; appears to be GFF/GTF-style -- TODO
    confirm), collecting the attribute keys found in the optional 9th
    column, which holds ``key=value;`` pairs.  It then creates a new
    DNAnexus GTable whose schema is the fixed set of reserved columns plus
    one string column per non-reserved attribute key (keys of 100+
    characters are skipped).

    Returns a tuple (spansTable, additionalColumns): the newly created
    dxpy GTable and the list of attribute-derived column names appended
    to the schema.

    Raises dxpy.AppError when a data row does not have 8 or 9 fields.
    """
    attributes = {}
    # Fix: open the file with a context manager so the handle is closed
    # even if parsing raises (the previous version leaked the descriptor).
    with open(inputFileName, 'r') as inputFile:
        for line in inputFile:
            # Lines starting with "#" are comments; a trailing "#..." on a
            # data line is stripped as an inline comment.
            if line[0] != "#":
                line = line.strip().split("#")[0]
                tabSplit = line.split("\t")
                if len(tabSplit) == 1:
                    # Not tab-delimited: fall back to space-delimited and
                    # re-join everything from field 9 onward, since the
                    # attribute column may itself contain spaces.
                    tabSplit = line.split(" ")
                    if len(tabSplit) < 9:
                        raise dxpy.AppError(
                            "One row did not have 8 or 9 entries, it had 1 instead. Offending line: "
                            + line)
                    tabSplit[8] = " ".join(tabSplit[8:])
                    tabSplit = tabSplit[:9]

                if len(tabSplit) != 8 and len(tabSplit) != 9:
                    raise dxpy.AppError(
                        "One row did not have 8 or 9 entries, it had " +
                        str(len(tabSplit)) + " instead. Offending line: " + line)
                elif len(tabSplit) == 9:
                    # Record every attribute key present in the 9th column;
                    # the trailing ";" makes the final pair match too.
                    reg = re.findall("([^=]*)=([^;]*);", tabSplit[8].strip() + ";")
                    for x in reg:
                        attributes[x[0]] = True

    # Attribute keys that would shadow a fixed schema column below.
    reservedColumns = [
        "", "chr", "lo", "hi", "name", "span_id", "type", "score", "is_coding",
        "parent_id", "frame", "description", "source"
    ]

    #Construct table
    schema = [{
        "name": "chr",
        "type": "string"
    }, {
        "name": "lo",
        "type": "uint32"
    }, {
        "name": "hi",
        "type": "uint32"
    }, {
        "name": "name",
        "type": "string"
    }, {
        "name": "span_id",
        "type": "int32"
    }, {
        "name": "type",
        "type": "string"
    }, {
        "name": "strand",
        "type": "string"
    }, {
        "name": "score",
        "type": "float"
    }, {
        "name": "is_coding",
        "type": "boolean"
    }, {
        "name": "parent_id",
        "type": "int32"
    }, {
        "name": "frame",
        "type": "int16"
    }, {
        "name": "description",
        "type": "string"
    }, {
        "name": "source",
        "type": "string"
    }]

    # Every non-reserved attribute key becomes an extra string column.
    additionalColumns = []
    for k, v in attributes.iteritems():
        if k not in reservedColumns and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    # Genomic range index plus a lexicographic "search" index over
    # name/chr/lo/hi/type.
    indices = [
        dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
        dxpy.DXGTable.lexicographic_index([
            dxpy.DXGTable.lexicographic_index_column("name", True, False),
            dxpy.DXGTable.lexicographic_index_column("chr"),
            dxpy.DXGTable.lexicographic_index_column("lo"),
            dxpy.DXGTable.lexicographic_index_column("hi"),
            dxpy.DXGTable.lexicographic_index_column("type")
        ], "search")
    ]
    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
Exemple #30
0
    def test_var_initialization(self):
        """Smoke-test variable initialization for every supported input
        class: a generated app with a well-formed input spec must compile
        and run without errors, both locally and (where job running is
        enabled) as a built applet.
        """

        print("Setting current project to", self.project)
        dxpy.WORKSPACE_ID = self.project
        dxpy.PROJECT_CONTEXT_ID = self.project

        # Create one data object of each class to reference from the
        # command-line inputs below.
        applet_id = dxpy.api.applet_new({
            "project": dxpy.WORKSPACE_ID,
            "name": "anapplet",
            "dxapi": "1.0.0",
            "runSpec": {"code": "", "interpreter": "bash"},
        })['id']
        file_obj = dxpy.upload_string("foo", name="afile")
        gtable_obj = dxpy.new_dxgtable(
            columns=[{"name": "int_col", "type": "int"}], name="agtable")
        gtable_obj.add_rows([[3], [0]])
        gtable_obj.close(block=True)
        record_obj = dxpy.new_dxrecord(name="arecord")
        record_obj.close()

        dxapp_json = {
            "name": "all_vars",
            "title": "all_vars",
            "summary": "all_vars",
            "dxapi": "1.0.0",
            "version": "0.0.1",
            "categories": [],
            "inputSpec": [],
            "outputSpec": [],
        }

        classes = ['applet', 'record', 'file', 'gtable', 'boolean', 'int',
                   'float', 'string', 'hash', 'array:applet', 'array:record',
                   'array:file', 'array:gtable', 'array:boolean', 'array:int',
                   'array:float', 'array:string']

        for classname in classes:
            flat = classname.replace(":", "_")
            dxapp_json['inputSpec'].append({"name": "required_" + flat,
                                            "class": classname,
                                            "optional": False})
            # Outputs are optional so that empty arrays are acceptable;
            # they share names with the required inputs to allow
            # pass-through from input variables.
            dxapp_json['outputSpec'].append({"name": "required_" + flat,
                                             "class": classname,
                                             "optional": True})
            dxapp_json['inputSpec'].append({"name": "optional_" + flat,
                                            "class": classname,
                                            "optional": True})

        cmdline_args = ['-irequired_applet=anapplet',
                        '-irequired_array_applet=anapplet',
                        '-irequired_record=arecord',
                        '-irequired_array_record=arecord',
                        '-irequired_file=afile',
                        '-irequired_array_file=afile',
                        '-irequired_gtable=agtable',
                        '-irequired_array_gtable=agtable',
                        '-irequired_boolean=true',
                        '-irequired_array_boolean=true',
                        '-irequired_int=32', '-irequired_array_int=42',
                        '-irequired_float=3.4', '-irequired_array_float=.42',
                        '-irequired_string=foo', '-irequired_array_string=bar',
                        '-irequired_hash={"foo":"bar"}']

        for lang in supported_languages:
            appdir = create_app_dir_with_dxapp_json(dxapp_json, lang)
            # Run locally with the bare minimum of inputs.
            output = subprocess.check_output(
                ['dx-run-app-locally', appdir] + cmdline_args)
            print(output)
            self.assertIn("App finished successfully", output)

            # See PTFM-13697 for CentOS 5 details
            if testutil.TEST_RUN_JOBS and not testutil.host_is_centos_5():
                # Build it as a real applet and run it on the platform.
                applet_name = dxapp_json['name'] + '-' + lang
                subprocess.check_output(
                    ['dx', 'build', appdir, '--destination', applet_name])
                subprocess.check_output(
                    ['dx', 'run', applet_name, '-y', '--wait'] + cmdline_args)
def import_genes(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags, delimiter="\t"):
    """Import a gene-model (12-column) BED file into a new Genes GTable.

    Each BED line produces one parent "transcript" span row plus one or
    more child rows per block: the block is emitted as a single "exon"
    when thickStart/thickEnd carry no information, otherwise it is
    partitioned into "CDS", "5' UTR" and "3' UTR" rows based on the
    thick region and the strand field.

    Parameters:
        bed_file -- path of the local BED file to parse
        table_name -- name to give the new GTable
        ref_id -- ID of the original contigset, linked in the details
        file_id -- ID of the original file, linked in the details
                   (skipped when None)
        additional_types, tags -- accepted but not used in this function
                                  (NOTE(review): confirm this is intended)
        property_keys, property_values -- parallel lists merged into the
                   table details
        delimiter -- column separator (defaults to tab)

    Returns a dxlink to the new spans table.

    Raises dxpy.AppError when the property lists differ in length or a
    data line has fewer than 12 columns.
    """
    # implement BED importing from this format:
    # http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    columns = [("chr", "string"),
               ("lo", "int32"),
               ("hi", "int32"),
               ("name", "string"),
               ("span_id", "int32"),
               ("type", "string"),
               ("strand", "string"),
               ("is_coding", "boolean"),
               ("parent_id", "int32"),
               ("frame", "int16"),
               ("description", "string")]

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]

    # Genomic range index plus a lexicographic "search" index over
    # name/chr/lo/hi/type.
    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri'), 
               dxpy.DXGTable.lexicographic_index([
                  dxpy.DXGTable.lexicographic_index_column("name", True, False),
                  dxpy.DXGTable.lexicographic_index_column("chr"),
                  dxpy.DXGTable.lexicographic_index_column("lo"),
                  dxpy.DXGTable.lexicographic_index_column("hi"),
                  dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    # Template row: one default value per column above.
    default_row = ["", 0, 0, "", -1, "", ".", False, -1, -1, ""]

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        span_table_id = span.get_id()

        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]
        span.set_details(details)

        span.add_types(["gri", "Genes"])
        span.rename(table_name)

        # span_id values are assigned sequentially across the whole file.
        current_span_id = 0

        # where the parsing magic happens
        for line in bed:
            # "track ..." header lines are stashed in the details rather
            # than parsed as data.
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            row = list(default_row)
            line = line.split(delimiter)
            validate_line(line)  # helper defined elsewhere in this module
            if len(line) < 12:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in gene model-like BED file contains less than 12 columns.  Invalid BED file.")

            # add parent gene track
            row = generate_gene_row(line, 0, 0, "transcript", default_row, -1, current_span_id)
            if row != None:
                span.add_row(row)
                current_parent_id = current_span_id
                current_span_id += 1

                # add all children
                blockCount = int(line[9])
                line[10] = line[10].rstrip(",").split(",")
                blockSizes = [int(line[10][n]) for n in range(blockCount)]
                line[11] = line[11].rstrip(",").split(",")
                blockStarts = [int(line[11][n]) for n in range(blockCount)]

                gene_lo = int(line[1])
                gene_hi = int(line[2])

                # set thick* to be within the gene if outside
                thickStart = min(max(int(line[6]), gene_lo), gene_hi)
                thickEnd = max(min(int(line[7]), gene_hi), gene_lo)

                for i in range(blockCount):
                    # look to thickStart and thickEnd to get information about the type of this region
                    # if thick* are the same or cover the whole transcript then we ignore them
                    # else, we partition the exons into CDS and UTR based on their boundaries
                    if thickStart == thickEnd or (thickStart == gene_lo and thickEnd == gene_hi):
                        span.add_row(generate_gene_row(line, 
                                                       blockSizes[i], 
                                                       blockStarts[i], 
                                                       "exon", 
                                                       default_row, 
                                                       current_parent_id, 
                                                       current_span_id))
                        current_span_id += 1
                    else:
                        # Block boundaries in absolute coordinates.
                        exon_lo = int(line[1])+blockStarts[i]
                        exon_hi = int(exon_lo+blockSizes[i])

                        # we're all UTR if we enter either of these
                        # (block lies entirely outside the thick region;
                        # 5' vs 3' depends on the strand in line[5])
                        if (exon_hi <= thickStart and line[5] == '+') or (exon_lo >= thickEnd and line[5] == '-'):
                            span.add_row(generate_gene_row(line, 
                                                           blockSizes[i], 
                                                           blockStarts[i], 
                                                           "5' UTR", 
                                                           default_row, 
                                                           current_parent_id, 
                                                           current_span_id))
                            current_span_id += 1
                        elif (exon_hi <= thickStart and line[5] == '-') or (exon_lo >= thickEnd and line[5] == '+'):
                            span.add_row(generate_gene_row(line, 
                                                           blockSizes[i], 
                                                           blockStarts[i], 
                                                           "3' UTR", 
                                                           default_row, 
                                                           current_parent_id, 
                                                           current_span_id))
                            current_span_id += 1

                        # if this is true then we overlap CDS partially or completely
                        elif (exon_lo < thickEnd and exon_hi > thickStart):
                            # entirely contained
                            if exon_lo >= thickStart and exon_hi <= thickEnd:
                                span.add_row(generate_gene_row(line, 
                                                               blockSizes[i], 
                                                               blockStarts[i], 
                                                               "CDS", 
                                                               default_row, 
                                                               current_parent_id, 
                                                               current_span_id))
                                current_span_id += 1
                            else:
                                # Block straddles a thick boundary: emit up
                                # to three rows (left UTR, CDS, right UTR).
                                # left portion is UTR
                                if exon_lo < thickStart:
                                    if line[5] == '+':
                                        UTR_type = "5' UTR"
                                    else:
                                        UTR_type = "3' UTR"
                                    UTR_size = (min(blockSizes[i], thickStart - exon_lo))
                                    span.add_row(generate_gene_row(line, 
                                                                   UTR_size, 
                                                                   blockStarts[i], 
                                                                   UTR_type,
                                                                   default_row, 
                                                                   current_parent_id, 
                                                                   current_span_id))
                                    current_span_id += 1

                                # CDS portion
                                CDS_size = blockSizes[i] - (max(exon_lo, thickStart) - exon_lo)
                                CDS_size -= (exon_hi - min(exon_hi, thickEnd))
                                CDS_start = (max(exon_lo, thickStart) - exon_lo) + blockStarts[i]
                                span.add_row(generate_gene_row(line, 
                                                               CDS_size, 
                                                               CDS_start, 
                                                               "CDS",
                                                               default_row, 
                                                               current_parent_id, 
                                                               current_span_id))
                                current_span_id += 1

                                # right portion is UTR
                                if exon_hi > thickEnd:
                                    if line[5] == '+':
                                        UTR_type = "3' UTR"
                                    else:
                                        UTR_type = "5' UTR"
                                    UTR_size = (min(blockSizes[i], exon_hi - thickEnd))
                                    UTR_start = blockStarts[i] + thickEnd - exon_lo
                                    span.add_row(generate_gene_row(line, 
                                                                   UTR_size, 
                                                                   UTR_start, 
                                                                   UTR_type,
                                                                   default_row, 
                                                                   current_parent_id, 
                                                                   current_span_id))
                                    current_span_id += 1

    return dxpy.dxlink(span.get_id())
Exemple #32
0
def import_named_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags):
    """Import a plain (non-gene-model) BED file into a named Spans GTable.

    Builds a table with up to the first nine standard BED columns
    (chr/lo/hi/name/score/strand/thick_start/thick_end/item_rgb); any
    extra columns become generic string columns named "BED_column_N".
    The table is indexed by genomic range and lexicographically on
    "name", typed ["Spans", "gri"], and the original contigset/file plus
    caller-supplied properties are recorded in its details.

    Parameters mirror import_genes; additional_types and tags are
    accepted but unused in this function (NOTE(review): confirm intended).

    Returns a dxlink to the new table.

    Raises dxpy.AppError when the property lists differ in length or a
    data line has fewer than 3 columns.
    """
    num_cols = find_num_columns(bed_file)  # helper defined elsewhere

    possible_columns = [("chr", "string"),
                        ("lo", "int32"),
                        ("hi", "int32"),
                        ("name", "string"),
                        ("score", "float"),
                        ("strand", "string"),
                        ("thick_start", "int32"),
                        ("thick_end", "int32"),
                        ("item_rgb", "string")]

    possible_default_row = ["", 0, 0, "", 0, ".", 0, 0, ""]

    columns = possible_columns[:num_cols]

    # Files wider than the nine standard BED columns get generic string
    # columns (and matching "" defaults).
    if num_cols > len(columns):
        for i in range(len(columns), num_cols):
            columns.append(("BED_column_"+str(i+1), "string"))
            possible_default_row.append("")

    default_row = possible_default_row[:num_cols]

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]
    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
    # NOTE(review): this uses the [["name", "ASC"]] shorthand rather than
    # lexicographic_index_column as the other importers in this file do --
    # confirm both forms are accepted by the API.
    name_index = dxpy.DXGTable.lexicographic_index([["name", "ASC"]], "name")
    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=[gri_index, name_index], mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]    
    
        span.set_details(details)

        span.add_types(["Spans", "gri"])
        span.rename(table_name)

        for line in bed:
            row = list(default_row)

            # "track ..." header lines are stored in details, not parsed.
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            line = line.split()
            # check to see if this is a weird line
            if len(line) == 0:
                break  # an empty line ends processing of the file
            if len(line) < 3:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in BED file contains less than the minimum 3 columns.  Invalid BED file.")

            try:
                # Fill as many fields as the line provides; missing
                # trailing fields keep their defaults via the IndexError
                # handler below.
                row[0] = line[0]
                row[1] = int(line[1])
                row[2] = int(line[2])
                row[3] = line[3]
                # dashes are sometimes used when field is invalid
                if line[4] == "-":
                    line[4] = 0
                row[4] = int(line[4])
                row[5] = line[5]
                # dashes are sometimes used when field is invalid
                if line[6] == "-":
                    line[6] = 0
                row[6] = int(line[6])
                # dashes are sometimes used when field is invalid
                if line[7] == "-":
                    line[7] = 0
                row[7] = int(line[7])
                row[8] = line[8]

            # an index error would come from having fewer columns in a row, which we should handle ok
            except IndexError:
                pass
            # value error when fields are messed up and string gets converted to int, etc.  Throw these out.
            except ValueError:
                continue
            
            span.add_row(row)

    return dxpy.dxlink(span.get_id())
def import_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags, isBedDetail, delimiter="\t"):
    """Import a BED (or UCSC bedDetail) file into a new "Spans" GTable.

    Columns are chosen from the standard BED fields according to how many
    columns the file actually has; columns beyond the 9 standard ones become
    generic string columns.  Returns a dxlink to the new table.

    Parameters:
        bed_file: path to the BED file to import
        table_name: name given to the new GTable
        ref_id: ID of the original contigset (stored in the table details)
        file_id: ID of the original file object, or None
        additional_types: not used by this function (accepted for a uniform
            caller signature -- TODO confirm)
        property_keys: list of extra detail keys (parallel to property_values)
        property_values: list of extra detail values
        tags: not used by this function (accepted for a uniform caller
            signature -- TODO confirm)
        isBedDetail: True if the last two columns are bedDetail ID/description
        delimiter: field separator (default tab)

    Raises:
        dxpy.AppError: if property lists differ in length or a row has
            fewer than 3 columns.
    """
    num_cols = find_num_columns(bed_file, delimiter)

    # if this is a bedDetail file we should treat the last two columns separately
    if isBedDetail:
        num_cols -= 2

    # Standard BED columns in order; only the first num_cols are used.
    possible_columns = [("chr", "string"),
                        ("lo", "int32"),
                        ("hi", "int32"),
                        ("name", "string"),
                        ("score", "float"),
                        ("strand", "string"),
                        ("thick_start", "int32"),
                        ("thick_end", "int32"),
                        ("item_rgb", "string")]

    bedDetail_columns = [("bedDetail_ID", "string"),
                         ("bedDetail_desc", "string")]

    # Defaults used to pad short rows, one per possible column.
    possible_default_row = ["", 0, 0, "", 0, ".", 0, 0, ""]

    columns = possible_columns[:num_cols]

    if isBedDetail:
        columns.extend(bedDetail_columns)

    # Files wider than the 9 standard BED columns get generic string columns.
    if num_cols > len(columns):
        for i in range(len(columns), num_cols):
            columns.append(("BED_column_"+str(i+1), "string"))
            possible_default_row.append("")

    default_row = possible_default_row[:num_cols]

    if isBedDetail:
        default_row.extend(["",""])

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]

    # Always build a genomic-range index; add a lexicographic "search" index
    # only if the file is wide enough to have a "name" column.
    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri')]
    for c in columns:
        if "name" in c:
            indices.append(dxpy.DXGTable.lexicographic_index([
                              dxpy.DXGTable.lexicographic_index_column("name", True, False),
                              dxpy.DXGTable.lexicographic_index_column("chr"),
                              dxpy.DXGTable.lexicographic_index_column("lo"),
                              dxpy.DXGTable.lexicographic_index_column("hi")], "search"))
            break

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]

        span.set_details(details)

        span.add_types(["Spans", "gri"])
        span.rename(table_name)

        for line in bed:
            row = list(default_row)

            # "track" header lines are stored in the table details, not as rows.
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            line = line.split(delimiter)
            if isBedDetail:
                # only the first 4 columns are guaranteed to be defined by UCSC
                validate_line(line[:4])
                # save last two fields separately
                bedDetailFields = line[-2:]
                line = line[:-2]
            else:
                validate_line(line[:num_cols])

            # check to see if this is a weird line
            # NOTE(review): an empty line stops the entire import (break, not
            # continue) -- presumably blank lines only occur at EOF; confirm.
            if len(line) == 0:
                break
            if len(line) < 3:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in BED file contains less than the minimum 3 columns.  Invalid BED file.")

            try:
                row[0] = line[0]
                row[1] = int(line[1])
                row[2] = int(line[2])
                row[3] = line[3]
                # dashes are sometimes used when field is invalid
                if line[4] == "-" or line[4] == ".":
                    line[4] = 0
                row[4] = float(line[4])
                row[5] = line[5]
                # dashes are sometimes used when field is invalid
                if line[6] == "-" or line[6] == ".":
                    line[6] = 0
                row[6] = int(line[6])
                # dashes are sometimes used when field is invalid
                if line[7] == "-" or line[7] == ".":
                    line[7] = 0
                row[7] = int(line[7])
                row[8] = line[8]

            # an index error would come from having fewer columns in a row, which we should handle ok
            # (the defaults from default_row remain in place for missing fields)
            except IndexError:
                pass
            # value error when fields are messed up and string gets converted to int, etc.  Throw these out.
            except ValueError:
                continue

            if isBedDetail:
                # add these in at the end if we have a bedDetail file
                row[num_cols] = bedDetailFields[0]
                row[num_cols+1] = bedDetailFields[1]

            span.add_row(row)

        span.flush()

    return dxpy.dxlink(span.get_id())
Example #34
0
 def test_table_context_manager_destructor(self):
     # Create a table and add rows without closing or flushing it explicitly;
     # this relies on the DXGTable destructor to flush the pending rows.
     dxgtable = dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                                   dxpy.DXGTable.make_column_desc("b", "int32")])
     for i in range(64):
         dxgtable.add_rows(data=[["row"+str(i), i]])
def import_genes(bed_file, table_name, ref_id, file_id, additional_types, property_keys, property_values, tags, delimiter="\t"):
    # implement BED importing from this format:
    # http://genome.ucsc.edu/FAQ/FAQformat.html#format1
    """Import a 12-column gene-model BED file into a new "Genes" GTable.

    Each input line produces one parent "transcript" row plus child rows for
    every block, partitioned into exon / CDS / 5' UTR / 3' UTR spans using the
    thickStart/thickEnd fields.  Returns a dxlink to the new table.

    Parameters:
        bed_file: path to the 12-column BED file
        table_name: name given to the new GTable
        ref_id: ID of the original contigset (stored in the table details)
        file_id: ID of the original file object, or None
        additional_types: not used by this function -- TODO confirm
        property_keys/property_values: parallel lists of extra details
        tags: not used by this function -- TODO confirm
        delimiter: field separator (default tab)

    Raises:
        dxpy.AppError: on mismatched property lists or rows with fewer
            than 12 columns.
    """

    columns = [("chr", "string"),
               ("lo", "int32"),
               ("hi", "int32"),
               ("name", "string"),
               ("span_id", "int32"),
               ("type", "string"),
               ("strand", "string"),
               ("is_coding", "boolean"),
               ("parent_id", "int32"),
               ("frame", "int16"),
               ("description", "string")]

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]

    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                  dxpy.DXGTable.lexicographic_index_column("name", True, False),
                  dxpy.DXGTable.lexicographic_index_column("chr"),
                  dxpy.DXGTable.lexicographic_index_column("lo"),
                  dxpy.DXGTable.lexicographic_index_column("hi"),
                  dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    default_row = ["", 0, 0, "", -1, "", ".", False, -1, -1, ""]

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        # NOTE(review): span_table_id is assigned but never used below.
        span_table_id = span.get_id()

        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]
        span.set_details(details)

        span.add_types(["gri", "Genes"])
        span.rename(table_name)

        # span_id counter shared by parents and children; parent_id links
        # children back to their transcript row.
        current_span_id = 0

        # where the parsing magic happens
        for line in bed:
            # "track" header lines go into the table details, not rows.
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            row = list(default_row)
            line = line.split(delimiter)
            validate_line(line)
            if len(line) < 12:
                raise dxpy.AppError("Line: "+"\t".join(line)+" in gene model-like BED file contains less than 12 columns.  Invalid BED file.")

            # add parent gene track
            row = generate_gene_row(line, 0, 0, "transcript", default_row, -1, current_span_id)
            # generate_gene_row can apparently return None for rejected rows;
            # such lines produce no parent and no children -- TODO confirm.
            if row != None:
                span.add_row(row)
                current_parent_id = current_span_id
                current_span_id += 1

                # add all children
                blockCount = int(line[9])
                line[10] = line[10].rstrip(",").split(",")
                blockSizes = [int(line[10][n]) for n in range(blockCount)]
                line[11] = line[11].rstrip(",").split(",")
                blockStarts = [int(line[11][n]) for n in range(blockCount)]

                gene_lo = int(line[1])
                gene_hi = int(line[2])

                # set thick* to be within the gene if outside
                thickStart = min(max(int(line[6]), gene_lo), gene_hi)
                thickEnd = max(min(int(line[7]), gene_hi), gene_lo)

                for i in range(blockCount):
                    # look to thickStart and thickEnd to get information about the type of this region
                    # if thick* are the same or cover the whole transcript then we ignore them
                    # else, we partition the exons into CDS and UTR based on their boundaries
                    if thickStart == thickEnd or (thickStart == gene_lo and thickEnd == gene_hi):
                        span.add_row(generate_gene_row(line,
                                                       blockSizes[i],
                                                       blockStarts[i],
                                                       "exon",
                                                       default_row,
                                                       current_parent_id,
                                                       current_span_id))
                        current_span_id += 1
                    else:
                        exon_lo = int(line[1])+blockStarts[i]
                        exon_hi = int(exon_lo+blockSizes[i])

                        # we're all UTR if we enter either of these
                        # (which end is 5' vs 3' depends on strand, line[5])
                        if (exon_hi <= thickStart and line[5] == '+') or (exon_lo >= thickEnd and line[5] == '-'):
                            span.add_row(generate_gene_row(line,
                                                           blockSizes[i],
                                                           blockStarts[i],
                                                           "5' UTR",
                                                           default_row,
                                                           current_parent_id,
                                                           current_span_id))
                            current_span_id += 1
                        elif (exon_hi <= thickStart and line[5] == '-') or (exon_lo >= thickEnd and line[5] == '+'):
                            span.add_row(generate_gene_row(line,
                                                           blockSizes[i],
                                                           blockStarts[i],
                                                           "3' UTR",
                                                           default_row,
                                                           current_parent_id,
                                                           current_span_id))
                            current_span_id += 1

                        # if this is true then we overlap CDS partially or completely
                        elif (exon_lo < thickEnd and exon_hi > thickStart):
                            # entirely contained
                            if exon_lo >= thickStart and exon_hi <= thickEnd:
                                span.add_row(generate_gene_row(line,
                                                               blockSizes[i],
                                                               blockStarts[i],
                                                               "CDS",
                                                               default_row,
                                                               current_parent_id,
                                                               current_span_id))
                                current_span_id += 1
                            else:
                                # partial overlap: emit up to three child rows
                                # (left UTR, CDS, right UTR)
                                # left portion is UTR
                                if exon_lo < thickStart:
                                    if line[5] == '+':
                                        UTR_type = "5' UTR"
                                    else:
                                        UTR_type = "3' UTR"
                                    UTR_size = (min(blockSizes[i], thickStart - exon_lo))
                                    span.add_row(generate_gene_row(line,
                                                                   UTR_size,
                                                                   blockStarts[i],
                                                                   UTR_type,
                                                                   default_row,
                                                                   current_parent_id,
                                                                   current_span_id))
                                    current_span_id += 1

                                # CDS portion
                                CDS_size = blockSizes[i] - (max(exon_lo, thickStart) - exon_lo)
                                CDS_size -= (exon_hi - min(exon_hi, thickEnd))
                                CDS_start = (max(exon_lo, thickStart) - exon_lo) + blockStarts[i]
                                span.add_row(generate_gene_row(line,
                                                               CDS_size,
                                                               CDS_start,
                                                               "CDS",
                                                               default_row,
                                                               current_parent_id,
                                                               current_span_id))
                                current_span_id += 1

                                # right portion is UTR
                                if exon_hi > thickEnd:
                                    if line[5] == '+':
                                        UTR_type = "3' UTR"
                                    else:
                                        UTR_type = "5' UTR"
                                    UTR_size = (min(blockSizes[i], exon_hi - thickEnd))
                                    UTR_start = blockStarts[i] + thickEnd - exon_lo
                                    span.add_row(generate_gene_row(line,
                                                                   UTR_size,
                                                                   UTR_start,
                                                                   UTR_type,
                                                                   default_row,
                                                                   current_parent_id,
                                                                   current_span_id))
                                    current_span_id += 1

    return dxpy.dxlink(span.get_id())
Example #36
0
 def test_create_table_with_invalid_spec(self):
     # "muffins" is not a valid column type, so the server should reject
     # the table spec with a DXAPIError.
     with self.assertRaises(DXAPIError):
         dxpy.new_dxgtable([dxpy.DXGTable.make_column_desc("a", "string"),
                           dxpy.DXGTable.make_column_desc("b", "muffins")])
def constructTable(inputFileName):
    """Validate a 9-column GTF-style annotation file and create a spans GTable.

    Every attribute key found in column 9 (e.g. gene_id, transcript_id)
    becomes an extra string column in the table schema.

    Args:
        inputFileName: path to a tab- (or space-) delimited annotation file.

    Returns:
        A tuple (spansTable, additionalColumns) where spansTable is the new,
        still-open dxpy.DXGTable and additionalColumns lists the attribute
        column names appended after the fixed schema.

    Raises:
        dxpy.AppError: if a data row does not have exactly 9 columns or is
            missing a gene_id / transcript_id attribute.
    """
    attributes = {"gene_id" : True, "transcript_id": True}
    # Use a context manager so the input file is always closed
    # (the previous version leaked the file handle).
    with open(inputFileName, 'r') as inputFile:
        for line in inputFile:
            if line[0] != "#":  # skip comment lines
                tabSplit = line.split("\t")
                if len(tabSplit) == 1:
                    # Fall back to space-delimited parsing; everything from the
                    # 9th field onward is re-joined into one attributes column.
                    tabSplit = line.split(" ")
                    if len(tabSplit) < 9:
                        raise dxpy.AppError("One row did not have 9 entries, it had 1 instead. Offending line: " + line)
                    tabSplit[8] = " ".join(tabSplit[8:])
                    tabSplit = tabSplit[:9]

                if len(tabSplit) != 9:
                    raise dxpy.AppError("One row did not have 9 entries, it had " + str(len(tabSplit)) + " instead. Offending line: " + line)

                # Collect every attribute key and verify the mandatory ones.
                entrySplit = tabSplit[8].split(";")
                geneIdPresent = False
                transcriptIdPresent = False
                for x in entrySplit:
                    keyValue = x.strip().split(" ")
                    key = keyValue[0]
                    if key == "gene_id":
                        geneIdPresent = True
                    elif key == "transcript_id":
                        transcriptIdPresent = True
                    attributes[key] = True
                if not geneIdPresent:
                    raise dxpy.AppError("One row did not have a gene_id Offending line: " + line)
                if not transcriptIdPresent:
                    # Fixed copy-paste bug: this message previously said gene_id.
                    raise dxpy.AppError("One row did not have a transcript_id Offending line: " + line)

    # Construct table: fixed span columns first...
    schema = [
            {"name": "chr", "type": "string"},
            {"name": "lo", "type": "uint32"},
            {"name": "hi", "type": "uint32"},
            {"name": "name", "type": "string"},
            {"name": "span_id", "type": "int32"},
            {"name": "type", "type": "string"},
            {"name": "strand", "type": "string"},
            {"name": "score", "type": "float"},
            {"name": "is_coding", "type": "boolean"},
            {"name": "parent_id", "type": "int32"},
            {"name": "frame", "type": "int16"},
            {"name": "description", "type": "string"},
            {"name": "source", "type": "string"},
            {"name": "gene_id", "type": "string"},
            {"name": "transcript_id", "type": "string"}]

    additionalColumns = ['gene_id', 'transcript_id']
    # ...then one string column per remaining attribute key (skip empty and
    # very long keys).  Iterate keys directly instead of the Python-2-only
    # iteritems(); the values are always True and were never used.
    for k in attributes:
        if k != '' and k != 'gene_id' and k != 'transcript_id' and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [dxpy.DXGTable.genomic_range_index("chr","lo","hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                  dxpy.DXGTable.lexicographic_index_column("name", True, False),
                  dxpy.DXGTable.lexicographic_index_column("chr"),
                  dxpy.DXGTable.lexicographic_index_column("lo"),
                  dxpy.DXGTable.lexicographic_index_column("hi"),
                  dxpy.DXGTable.lexicographic_index_column("type")], "search")]
    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
def constructTable(inputFileName):
    """Validate a 9-column GTF-style annotation file and create a spans GTable.

    Every attribute key found in column 9 (e.g. gene_id, transcript_id)
    becomes an extra string column in the table schema.

    Args:
        inputFileName: path to a tab- (or space-) delimited annotation file.

    Returns:
        A tuple (spansTable, additionalColumns) where spansTable is the new,
        still-open dxpy.DXGTable and additionalColumns lists the attribute
        column names appended after the fixed schema.

    Raises:
        dxpy.AppError: if a data row does not have exactly 9 columns or is
            missing a gene_id / transcript_id attribute.
    """
    attributes = {"gene_id": True, "transcript_id": True}
    # Use a context manager so the input file is always closed
    # (the previous version leaked the file handle).
    with open(inputFileName, 'r') as inputFile:
        for line in inputFile:
            if line[0] != "#":  # skip comment lines
                tabSplit = line.split("\t")
                if len(tabSplit) == 1:
                    # Fall back to space-delimited parsing; everything from
                    # the 9th field onward is re-joined into one column.
                    tabSplit = line.split(" ")
                    if len(tabSplit) < 9:
                        raise dxpy.AppError(
                            "One row did not have 9 entries, it had 1 instead. Offending line: "
                            + line)
                    tabSplit[8] = " ".join(tabSplit[8:])
                    tabSplit = tabSplit[:9]

                if len(tabSplit) != 9:
                    raise dxpy.AppError("One row did not have 9 entries, it had " +
                                        str(len(tabSplit)) +
                                        " instead. Offending line: " + line)

                # Collect every attribute key and verify the mandatory ones.
                entrySplit = tabSplit[8].split(";")
                geneIdPresent = False
                transcriptIdPresent = False
                for x in entrySplit:
                    keyValue = x.strip().split(" ")
                    key = keyValue[0]
                    if key == "gene_id":
                        geneIdPresent = True
                    elif key == "transcript_id":
                        transcriptIdPresent = True
                    attributes[key] = True
                if not geneIdPresent:
                    raise dxpy.AppError(
                        "One row did not have a gene_id Offending line: " + line)
                if not transcriptIdPresent:
                    # Fixed copy-paste bug: message previously said gene_id.
                    raise dxpy.AppError(
                        "One row did not have a transcript_id Offending line: " + line)

    # Construct table: fixed span columns first...
    schema = [
        {"name": "chr", "type": "string"},
        {"name": "lo", "type": "uint32"},
        {"name": "hi", "type": "uint32"},
        {"name": "name", "type": "string"},
        {"name": "span_id", "type": "int32"},
        {"name": "type", "type": "string"},
        {"name": "strand", "type": "string"},
        {"name": "score", "type": "float"},
        {"name": "is_coding", "type": "boolean"},
        {"name": "parent_id", "type": "int32"},
        {"name": "frame", "type": "int16"},
        {"name": "description", "type": "string"},
        {"name": "source", "type": "string"},
        {"name": "gene_id", "type": "string"},
        {"name": "transcript_id", "type": "string"},
    ]

    additionalColumns = ['gene_id', 'transcript_id']
    # ...then one string column per remaining attribute key (skip empty and
    # very long keys).  Iterate keys directly instead of the Python-2-only
    # iteritems(); the values are always True and were never used.
    for k in attributes:
        if k != '' and k != 'gene_id' and k != 'transcript_id' and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [
        dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
        dxpy.DXGTable.lexicographic_index([
            dxpy.DXGTable.lexicographic_index_column("name", True, False),
            dxpy.DXGTable.lexicographic_index_column("chr"),
            dxpy.DXGTable.lexicographic_index_column("lo"),
            dxpy.DXGTable.lexicographic_index_column("hi"),
            dxpy.DXGTable.lexicographic_index_column("type")
        ], "search")
    ]
    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
def upload_transcripts_file( trans_file, sample_name ):
    """Upload a cufflinks FPKM tracking file into a new GTable.

    Args:
        trans_file: path to the tab-delimited cufflinks tracking file; the
            first line is a column header and is skipped.
        sample_name: used to name the table ("<sample_name>_FPKM_per_gene").

    Returns:
        The closed dxpy.DXGTable containing one row per transcript.

    Raises:
        dxpy.AppError: if a data line has fewer columns than expected.
    """
    with open(trans_file, 'r') as fh:
        # eat column header line
        line = fh.readline().rstrip('\n')

        line = line.split('\t')

        trans_schema = [("chr", "string"),
                        ("lo", "int32"),
                        ("hi", "int32"),
                        ("tracking_id", "string"),
                        ("class_code", "string"),
                        ("nearest_ref_id", "string"),
                        ("gene_id", "string"),
                        ("gene_short_name", "string"),
                        ("tss_id", "string"),
                        ("length", "int32"),
                        ("coverage", "float"),
                        ("FPKM", "float"),
                        ("FPKM_lo", "float"),
                        ("FPKM_hi", "float"),
                        ("status", "string")]

        column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in trans_schema]

        gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
        transcripts = dxpy.new_dxgtable(column_descriptors, indices=[gri_index])
        transcripts.rename(sample_name+"_FPKM_per_gene")

        while True:
            line = fh.readline()
            line = line.rstrip('\n')
            if line == '':
                break  # EOF (or a blank line) ends the data section

            line = line.split('\t')

            try:
                # locus column looks like "chr:lo-hi"; convert lo to 0-based
                chrom = line[6].split(":")[0]
                lo = int(line[6].split(":")[1].split("-")[0]) - 1
                hi = int(line[6].split(":")[1].split("-")[1])
                # no length set, set to 0
                if line[7] == '-':
                    line[7] = 0
                # no coverage set, set to -1
                if line[8] == '-':
                    line[8] = -1

                trans_row = [chrom, lo, hi,
                             line[0],
                             line[1],
                             line[2],
                             line[3],
                             line[4],
                             line[5],
                             int(line[7]),
                             float(line[8]),
                             float(line[9]),
                             float(line[10]),
                             float(line[11]),
                             line[12]]

                transcripts.add_row(trans_row)
            except IndexError:
                # Bug fix: 'line' is a list here (it was re-bound by split),
                # so the old "+ line" raised TypeError instead of AppError.
                raise dxpy.AppError("Error parsing transcript file from cufflinks.  Line: " + "\t".join(line))

    transcripts.close(block = True)

    return transcripts
    def test_var_initialization(self):
        '''
        This test assumes a well-formed input spec and mostly just
        tests that everything compiles and the variable initialization
        code does not throw any errors.
        '''

        print("Setting current project to", self.project)
        dxpy.WORKSPACE_ID = self.project
        dxpy.PROJECT_CONTEXT_ID = self.project

        # Make some data objects for input
        dxapplet = dxpy.api.applet_new({"project": dxpy.WORKSPACE_ID,
                                        "name": "anapplet",
                                        "dxapi": "1.0.0",
                                        "runSpec": {"code": "", "interpreter": "bash"}})['id']
        dxfile = dxpy.upload_string("foo", name="afile")
        dxgtable = dxpy.new_dxgtable(columns=[{"name": "int_col", "type": "int"}], name="agtable")
        dxgtable.add_rows([[3], [0]])
        dxgtable.close(block=True)
        dxrecord = dxpy.new_dxrecord(name="arecord")
        dxrecord.close()

        # Minimal app skeleton; the input/output specs are filled in below.
        dxapp_json = {
            "name": "all_vars",
            "title": "all_vars",
            "summary": "all_vars",
            "dxapi": "1.0.0",
            "version": "0.0.1",
            "categories": [],
            "inputSpec": [],
            "outputSpec": []
        }

        # Every input/output class the initialization code must handle.
        classes = ['applet', 'record', 'file', 'gtable',
                   'boolean', 'int', 'float', 'string', 'hash',
                   'array:applet', 'array:record', 'array:file', 'array:gtable',
                   'array:boolean', 'array:int', 'array:float', 'array:string']

        # For each class: one required input, one optional input, and one
        # (optional) output sharing the required input's name.
        for classname in classes:
            dxapp_json['inputSpec'].append({"name": "required_" + classname.replace(":", "_"),
                                            "class": classname,
                                            "optional": False})
            # Note: marking outputs as optional so that empty arrays
            # will be acceptable; keeping names the same (as required)
            # in order to allow pass-through from input variables
            dxapp_json['outputSpec'].append({"name": "required_" + classname.replace(":", "_"),
                                             "class": classname,
                                             "optional": True})
            dxapp_json['inputSpec'].append({"name": "optional_" + classname.replace(":", "_"),
                                            "class": classname,
                                            "optional": True})

        # Values reference, by name, the data objects created above.
        cmdline_args = ['-irequired_applet=anapplet',
                        '-irequired_array_applet=anapplet',
                        '-irequired_record=arecord',
                        '-irequired_array_record=arecord',
                        '-irequired_file=afile',
                        '-irequired_array_file=afile',
                        '-irequired_gtable=agtable',
                        '-irequired_array_gtable=agtable',
                        '-irequired_boolean=true',
                        '-irequired_array_boolean=true',
                        '-irequired_array_boolean=false',
                        '-irequired_int=32',
                        '-irequired_array_int=42',
                        '-irequired_float=3.4',
                        '-irequired_array_float=.42',
                        '-irequired_string=foo',
                        '-irequired_array_string=bar',
                        '-irequired_hash={"foo":"bar"}']
        for lang in supported_languages:
            appdir = create_app_dir_with_dxapp_json(dxapp_json, lang)
            # Test with bare-minimum of inputs
            output = subprocess.check_output(['dx-run-app-locally', appdir] + cmdline_args)
            print(output)
            # Verify array is printed total 3 times once in each input, logs, and final output
            self.assertEquals(len(re.findall("required_array_boolean = \[ true, false ]", output)), 3)
            self.assertIn("App finished successfully", output)

            # See PTFM-13697 for CentOS 5 details
            if testutil.TEST_RUN_JOBS and not testutil.host_is_centos_5():
                # Now actually make it an applet and run it
                applet_name = dxapp_json['name'] + '-' + lang
                subprocess.check_output(['dx', 'build', appdir, '--destination', applet_name])
                subprocess.check_output(['dx', 'run', applet_name, '-y', '--wait'] + cmdline_args)
def upload_transcripts_file(trans_file, sample_name):
    """Upload a cufflinks FPKM tracking file into a new GTable.

    Args:
        trans_file: path to the tab-delimited cufflinks tracking file; the
            first line is a column header and is skipped.
        sample_name: used to name the table ("<sample_name>_FPKM_per_gene").

    Returns:
        The closed dxpy.DXGTable containing one row per transcript.

    Raises:
        dxpy.AppError: if a data line has fewer columns than expected.
    """
    with open(trans_file, 'r') as fh:
        # eat column header line
        line = fh.readline().rstrip('\n')

        line = line.split('\t')

        trans_schema = [("chr", "string"), ("lo", "int32"), ("hi", "int32"),
                        ("tracking_id", "string"), ("class_code", "string"),
                        ("nearest_ref_id", "string"), ("gene_id", "string"),
                        ("gene_short_name", "string"), ("tss_id", "string"),
                        ("length", "int32"), ("coverage", "float"),
                        ("FPKM", "float"), ("FPKM_lo", "float"),
                        ("FPKM_hi", "float"), ("status", "string")]

        column_descriptors = [
            dxpy.DXGTable.make_column_desc(name, type)
            for name, type in trans_schema
        ]

        gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
        transcripts = dxpy.new_dxgtable(column_descriptors,
                                        indices=[gri_index])
        transcripts.rename(sample_name + "_FPKM_per_gene")

        while True:
            line = fh.readline()
            line = line.rstrip('\n')
            if line == '':
                break  # EOF (or a blank line) ends the data section

            line = line.split('\t')

            try:
                # locus column looks like "chr:lo-hi"; convert lo to 0-based
                chrom = line[6].split(":")[0]
                lo = int(line[6].split(":")[1].split("-")[0]) - 1
                hi = int(line[6].split(":")[1].split("-")[1])
                # no length set, set to 0
                if line[7] == '-':
                    line[7] = 0
                # no coverage set, set to -1
                if line[8] == '-':
                    line[8] = -1

                trans_row = [
                    chrom, lo, hi, line[0], line[1], line[2], line[3], line[4],
                    line[5],
                    int(line[7]),
                    float(line[8]),
                    float(line[9]),
                    float(line[10]),
                    float(line[11]), line[12]
                ]

                transcripts.add_row(trans_row)
            except IndexError:
                # Bug fix: 'line' is a list here (it was re-bound by split),
                # so the old "+ line" raised TypeError instead of AppError.
                raise dxpy.AppError(
                    "Error parsing transcript file from cufflinks.  Line: " +
                    "\t".join(line))

    transcripts.close(block=True)

    return transcripts