Ejemplo n.º 1
0
def create_or_get_subtable(table, subtable_key, size):
    """Return the subtable stored at `subtable_key`, creating it if absent.

    The existing entry is fetched with `get_in`. If that yields None, a
    fresh copy of READGROUP_TEMPLATE is made, its "Size" field is set to
    `size`, and the copy is stored in `table` with `set_in`. An existing
    entry is returned untouched; `size` is ignored in that case.
    """
    existing = get_in(table, subtable_key)
    if existing is not None:
        return existing

    # Copy the template so that entries never share state with it
    created = dict(READGROUP_TEMPLATE)
    created["Size"] = size
    set_in(table, subtable_key, created)
    return created
Ejemplo n.º 2
0
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Store one GTF record in `table`, keyed by gene/transcript/exon.

    The record is stored (via `set_in`) under the key path
    (gene type, gene_id, transcript_id, exon_number, feature). If the
    prefixed contig name is found in `scaffolds`, the contig is replaced
    by the scaffold's "chrom" and the coordinates are shifted by its
    "chromStart". An assertion fails if the key path is already in use.
    """
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    attributes = gtf.asDict()

    # Prefer "gene_biotype"; fall back to "gene_type", then a placeholder
    biotype = attributes.get("gene_biotype")
    if biotype is None:
        biotype = attributes.get("gene_type", "unknown_genetype")

    gene_id = attributes["gene_id"]
    transcript_id = attributes["transcript_id"]
    exon_number = int(attributes["exon_number"])
    path = (biotype, gene_id, transcript_id, exon_number, gtf.feature)

    record = {
        "contig": contig_prefix + gtf.contig,
        "start": gtf.start,
        # In pysam, 'end' equals the past-the-end position
        "end": gtf.end - 1,
        "strand": gtf.strand,
        "feature": gtf.feature,
        "transcript": attributes["transcript_id"],
    }

    if record["contig"] in scaffolds:
        # Translate scaffold-relative coordinates using the scaffold entry
        scaffold = scaffolds[record["contig"]]
        record["contig"] = scaffold["chrom"]
        offset = int(scaffold["chromStart"])
        record["start"] += offset
        record["end"] += offset

    previous = get_in(table, path)
    assert not previous, path
    set_in(table, path, record)
Ejemplo n.º 3
0
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Insert a single GTF record into the nested `table`.

    Records are stored (via `set_in`) under the key path
    (gene type, gene_id, transcript_id, exon_number, feature). When the
    prefixed contig name is a key of `scaffolds`, the record is rewritten
    to use the scaffold's "chrom" and both coordinates are offset by its
    "chromStart". An assertion fails if the key path is already occupied.
    """
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    attributes = gtf.asDict()

    # "gene_biotype" wins over "gene_type"; otherwise use a placeholder
    biotype = attributes.get("gene_biotype")
    if biotype is None:
        biotype = attributes.get("gene_type", "unknown_genetype")

    gene_id = attributes["gene_id"]
    transcript_id = attributes["transcript_id"]
    exon_number = int(attributes["exon_number"])
    path = (biotype, gene_id, transcript_id, exon_number, gtf.feature)

    record = {
        "contig": contig_prefix + gtf.contig,
        "start": gtf.start,
        # In pysam, 'end' equals the past-the-end position
        "end": gtf.end - 1,
        "strand": gtf.strand,
        "feature": gtf.feature,
        "transcript": attributes["transcript_id"],
    }

    if record["contig"] in scaffolds:
        # Shift both coordinates by the scaffold's start position
        scaffold = scaffolds[record["contig"]]
        record["contig"] = scaffold["chrom"]
        offset = int(scaffold["chromStart"])
        record["start"] += offset
        record["end"] += offset

    previous = get_in(table, path)
    assert not previous, path
    set_in(table, path, record)
Ejemplo n.º 4
0
def create_or_get_subtable(table, subtable_key, size):
    """Return the ReadGroup stored at `subtable_key`, creating it if absent.

    The existing entry is fetched with `get_in`. If that yields None, a
    new ReadGroup is created, its `Size` attribute is set to `size`, and
    it is stored in `table` with `set_in`. An existing entry is returned
    untouched; `size` is ignored in that case.
    """
    existing = get_in(table, subtable_key)
    if existing is not None:
        return existing

    group = ReadGroup()
    group.Size = size
    set_in(table, subtable_key, group)
    return group
Ejemplo n.º 5
0
def create_or_get_subtable(table, subtable_key, size):
    """Fetch the ReadGroup at `subtable_key`, adding a new one if missing.

    If `get_in` yields None for the key, a ReadGroup with `Size` set to
    `size` is created and stored in `table` with `set_in`. Otherwise the
    entry that is already present is returned and `size` is not used.
    """
    found = get_in(table, subtable_key)
    if found is not None:
        return found

    fresh = ReadGroup()
    fresh.Size = size
    set_in(table, subtable_key, fresh)
    return fresh
Ejemplo n.º 6
0
    def _collect_clade_from(self, cache, p_node, c_node):
        """Collect the leaves reachable from `c_node` without passing `p_node`.

        Results are memoized in `cache` under the key (p_node, c_node). On
        a cache miss the clade is built recursively, a frozenset copy is
        stored in the cache, and the mutable set that was built is
        returned. On a hit the (non-empty) cached value is returned as is.
        """
        edge = (p_node, c_node)
        clade = get_in(cache, edge, set())
        if clade:
            return clade

        if self.is_leaf(c_node):
            clade.add(c_node)

        # Walk every neighbour except the node we arrived from
        for neighbour in self.connections[c_node]:
            if neighbour != p_node:
                sub_clade = self._collect_clade_from(cache, c_node, neighbour)
                clade.update(sub_clade)

        set_in(cache, edge, frozenset(clade))
        return clade
Ejemplo n.º 7
0
def convert_reads(config, destination, record, sink_cache):
    """Write the filtered reads of every BAM file in `record` to read sinks.

    Sinks are opened lazily and kept in `sink_cache`, keyed by
    (source name, reads type). "Paired" and "Processed" reads go to a
    paired-end sink, and a "Singleton" sink is opened alongside it. All
    other read types go to a single-end sink of their own type. Only
    records with `qual >= config.min_quality` and
    `len(seq) >= config.min_length` are passed to the sink.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        # Open (once) the single-end sink for this read type; return its key
        sink_key = (name, reads_type)
        if not get_in(sink_cache, sink_key):
            path = ReadSink.get_filename(destination, reads_type.lower())
            set_in(sink_cache, sink_key,
                   ReadSink.open(config.destination, path))
        return sink_key

    def _keep_record(read):
        # Quality is checked first; length is only checked if it passes
        if not (read.qual >= config.min_quality):
            return False
        return len(read.seq) >= config.min_length

    for (reads_type, bam_files) in record.bams.iteritems():
        if reads_type not in ("Paired", "Processed"):
            sink_key = _open_se_sink(reads_type)
        else:
            # Processed reads are pre-aligned BAMs which have been cleaned up
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Singleton")

            sink_key = (name, "Paired")
            if not get_in(sink_cache, sink_key):
                pe_sink = PEReadSink.open(config.destination, destination)
                set_in(sink_cache, sink_key, pe_sink)

        sink = get_in(sink_cache, sink_key)
        for bam_filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, bam_filename))
            with pysam.Samfile(bam_filename) as handle:
                sink.write_records(read for read in handle
                                   if _keep_record(read))
Ejemplo n.º 8
0
def convert_reads(config, destination, record, sink_cache):
    """Stream the reads from the BAM files of `record` into cached sinks.

    One sink per (source name, reads type) is kept in `sink_cache` and
    opened on first use. "Paired" and "Processed" reads share the
    paired-end sink, for which a "Singleton" single-end sink is also
    opened. Other read types use a single-end sink of their own. Records
    are written only if `qual >= config.min_quality` and
    `len(seq) >= config.min_length`.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        # Ensure the single-end sink for this read type exists; return its key
        sink_key = (name, reads_type)
        if not get_in(sink_cache, sink_key):
            path = ReadSink.get_filename(destination, reads_type.lower())
            set_in(sink_cache, sink_key,
                   ReadSink.open(config.destination, path))
        return sink_key

    def _keep_record(read):
        # Quality is checked first; length is only checked if it passes
        if not (read.qual >= config.min_quality):
            return False
        return len(read.seq) >= config.min_length

    for (reads_type, bam_files) in record.bams.iteritems():
        if reads_type not in ("Paired", "Processed"):
            sink_key = _open_se_sink(reads_type)
        else:
            # Processed reads are pre-aligned BAMs which have been cleaned up
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Singleton")

            sink_key = (name, "Paired")
            if not get_in(sink_cache, sink_key):
                pe_sink = PEReadSink.open(config.destination, destination)
                set_in(sink_cache, sink_key, pe_sink)

        sink = get_in(sink_cache, sink_key)
        for bam_filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, bam_filename))
            with pysam.Samfile(bam_filename) as handle:
                sink.write_records(read for read in handle
                                   if _keep_record(read))
Ejemplo n.º 9
0
def read_table(table, filename):
    """Accumulate the rows of a padded table file into `table`.

    Each row is stored under (Name, Sample, Library, Contig). Rows where
    any of those fields is "*" are skipped. The first row seen for a key
    creates a copy of READGROUP_TEMPLATE with "Size" taken from the row;
    every row then adds its values to all template fields except "Size",
    with fields missing from the row counting as 0.
    """
    with open(filename) as handle:
        for row in parse_padded_table(handle):
            path = (row["Name"], row["Sample"],
                    row["Library"], row["Contig"])
            if "*" in path:
                continue

            group = get_in(table, path)
            if group is None:
                group = dict(READGROUP_TEMPLATE)
                group["Size"] = int(row["Size"])
                set_in(table, path, group)

            # Rows sharing a key must agree on the size
            assert int(group["Size"]) == int(row["Size"])
            for field in READGROUP_TEMPLATE:
                if field == "Size":
                    continue
                group[field] += int(row.get(field, 0))
Ejemplo n.º 10
0
def read_table(table, filename):
    """Accumulate the rows of a padded table file into `table`.

    Each row is stored under (Name, Sample, Library, Contig). Rows where
    any of those fields is "*" are skipped. The first row seen for a key
    creates a ReadGroup whose `Size` is taken from the row; every row then
    adds its values to each field named in `ReadGroup.__slots__` except
    "Size", with fields missing from the row counting as 0.
    """
    with open(filename) as handle:
        for row in parse_padded_table(handle):
            path = (row["Name"], row["Sample"],
                    row["Library"], row["Contig"])
            if "*" in path:
                continue

            group = get_in(table, path)
            if group is None:
                group = ReadGroup()
                group.Size = int(row["Size"])
                set_in(table, path, group)

            # Rows sharing a key must agree on the size
            assert int(group.Size) == int(row["Size"])
            for field in ReadGroup.__slots__:
                if field == "Size":
                    continue
                group[field] += int(row.get(field, 0))
Ejemplo n.º 11
0
    def _read_coverage_tables(cls, key, filenames):
        """Sum the "Hits" and "M" counts for `key` across coverage tables.

        Each file is read into a fresh table with `read_coverage_table`,
        and the entry at `key` is looked up with `get_in`. Returns a tuple
        (hits, nts). Raises NodeError if a file has no entry for `key`.
        """
        total_hits = 0
        total_nts = 0
        for filename in filenames:
            coverage = {}
            read_coverage_table(coverage, filename)

            per_contig = get_in(coverage, key)
            if per_contig is None:
                message = ("Error reading table %r; row not found:"
                           "\n   %s   ...\n\nIf files have been renamed "
                           "during the run, then please remove this file "
                           "in that it may be re-generated.\nHowever, "
                           "note that read-group tags in the BAM files "
                           "may not be correct!")
                raise NodeError(message % (filename, "   ".join(key)))

            for counts in per_contig.itervalues():
                total_hits += counts["Hits"]
                total_nts += counts["M"]

        return total_hits, total_nts
Ejemplo n.º 12
0
def read_table(table, filename):
    """Merge the rows of a padded table file into the nested `table`.

    The key of a row is (Name, Sample, Library, Contig); rows with "*" in
    any of those fields are ignored. A ReadGroup is created for each new
    key, with `Size` taken from the row. Every row then adds its values
    to each field named in `ReadGroup.__slots__` other than "Size", using
    0 for fields the row lacks.
    """
    with open(filename) as handle:
        for row in parse_padded_table(handle):
            path = (row["Name"], row["Sample"], row["Library"], row["Contig"])
            if "*" in path:
                continue

            group = get_in(table, path)
            if group is None:
                group = ReadGroup()
                group.Size = int(row["Size"])
                set_in(table, path, group)

            # Rows sharing a key must agree on the size
            assert int(group.Size) == int(row["Size"])
            for field in ReadGroup.__slots__:
                if field == "Size":
                    continue
                group[field] += int(row.get(field, 0))
Ejemplo n.º 13
0
def test_get_in__get_value_two_keywords():
    """A two-key path yields the value nested two levels deep."""
    nested = {1: {2: 3}}
    assert utils.get_in(nested, [1, 2]) == 3
Ejemplo n.º 14
0
 def _open_se_sink(reads_type):
     """Ensure a single-end sink exists for `reads_type`; return its key."""
     sink_key = (name, reads_type)
     if get_in(sink_cache, sink_key):
         return sink_key

     # No usable sink cached yet: open one and store it under the key
     path = ReadSink.get_filename(destination, reads_type.lower())
     set_in(sink_cache, sink_key, ReadSink.open(config.destination, path))
     return sink_key
Ejemplo n.º 15
0
def test_get_in__get_default_three_keywords_fail_at_first():
    """A path whose first key is missing yields None."""
    nested = {1: {2: {3: 4}}}
    assert utils.get_in(nested, [2, 2, 4]) is None
Ejemplo n.º 16
0
def test_get_in__get_default_one_keyword():
    """A single missing key yields None."""
    flat = {1: 2}
    assert utils.get_in(flat, [2]) is None
Ejemplo n.º 17
0
def test_get_in__get_value_one_keyword():
    """A single present key yields its value."""
    flat = {1: 2}
    assert utils.get_in(flat, [1]) == 2
Ejemplo n.º 18
0
 def _open_se_sink(reads_type):
     """Return the cache key of the single-end sink, opening it if needed."""
     sink_key = (name, reads_type)
     if get_in(sink_cache, sink_key):
         return sink_key

     # Nothing usable is cached for this key, so open and register a sink
     path = ReadSink.get_filename(destination, reads_type.lower())
     set_in(sink_cache, sink_key, ReadSink.open(config.destination, path))
     return sink_key