def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
        """
        Merge the fusion records of a new sample (read from gff_filename) into the
        records already held in self.record_d_fusion, then rebuild the loci with
        ClusterTrees and write the combined result out via write_cluster_tree_as_gff.

        :param gff_filename: collapsed fusion GFF of the incoming sample
        :param group_filename: group file of the incoming sample
        :param sample_prefix: prefix identifying the incoming sample
        :param output_prefix: prefix for the merged output files
        :param fastq_filename: optional FASTQ for the incoming sample
        """
        # Each entry is (existing_records, new_records); exactly one side is None
        # when the fusion appears in only one of the two inputs.
        combined = []
        # BUG FIX: in Python 3, dict.keys() returns a view object that has no
        # .remove(); materialize it into a list so matched ids can be removed.
        unmatched_recs = list(self.record_d_fusion.keys())

        for _id, records in GFF.collapseGFFFusionReader(gff_filename):
            match_seqid = self.match_fusion_record(records)
            if match_seqid is not None:
                combined.append((self.record_d_fusion[match_seqid], records))
                try:
                    unmatched_recs.remove(match_seqid)
                except ValueError:
                    pass  # already deleted, OK, this happens for single-exon transcripts
            else:  # records not present in the current tree
                combined.append((None, records))
        # put whatever is left from the tree in (present only in the existing records)
        for seqid in unmatched_recs:
            combined.append((self.record_d_fusion[seqid], None))

        # create ClusterTrees, keyed by chromosome then strand, to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
        for i, (r1s, r2s) in enumerate(combined):
            # insert whichever side exists; when both exist, insert the longer first record
            if r2s is None or (r1s is not None and r1s[0].end - r1s[0].start > r2s[0].end - r2s[0].start):
                final_tree[r1s[0].chr][r1s[0].strand].insert(r1s[0].start, r1s[0].end, i)
            else:
                final_tree[r2s[0].chr][r2s[0].strand].insert(r2s[0].start, r2s[0].end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
    def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
        combined = [] # list of (r1 if r2 is None | r2 if r1 is None | longer of r1 or r2 if both not None)
        unmatched_recs = list(self.record_d_fusion.keys())

        for _id, records in GFF.collapseGFFFusionReader(gff_filename):
            match_seqid = self.match_fusion_record(records)
            if match_seqid is not None:
                combined.append((self.record_d_fusion[match_seqid], records))
                try:
                    unmatched_recs.remove(match_seqid)
                except ValueError:
                    pass # already deleted, OK, this happens for single-exon transcripts
            else:  # r is not present in current tree
                combined.append((None, records))
        # put whatever is left from the tree in
        for seqid in unmatched_recs:
            combined.append((self.record_d_fusion[seqid], None))

        # create a ClusterTree to re-calc the loci/transcripts
        final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-':ClusterTree(0, 0)})
        for i,(r1s,r2s) in enumerate(combined):
            if r2s is None or (r1s is not None and r1s[0].end-r1s[0].start > r2s[0].end-r2s[0].start):
                final_tree[r1s[0].chr][r1s[0].strand].insert(r1s[0].start, r1s[0].end, i)
            else:
                final_tree[r2s[0].chr][r2s[0].strand].insert(r2s[0].start, r2s[0].end, i)

        self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
Ejemplo n.º 3
0
def sample_sanity_check(group_filename,
                        gff_filename,
                        count_filename,
                        fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files.

    :param group_filename: group file; first whitespace-separated field is the PBID
    :param gff_filename: collapsed fusion GFF
    :param count_filename: count file with a 14-line header, then a tab-delimited
        table containing a 'pbid' column
    :param fastq_filename: optional FASTQ whose record ids are '<pbid>|...'
    :return: raise Exception if sanity check failed
    """
    # BUG FIX: this block used Python 2-only syntax (print >> sys.stderr,
    # `raise Exception, msg`, xrange) which is a SyntaxError under Python 3;
    # also the file handles were never closed.
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    with open(group_filename) as f:
        ids1 = [line.strip().split()[0] for line in f]
    ids2 = [
        fusion_id
        for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)
    ]
    with open(count_filename) as f:
        for i in range(14):
            f.readline()  # just through the header
        ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(
            set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        with open(fastq_filename) as f:
            ids4 = [
                r.id.split('|')[0]
                for r in SeqIO.parse(f, 'fastq')
            ]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
    def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None, fastq_filename=None, fusion_max_dist=10):
        """
        Fusion-aware variant of MegaPBTree.

        Differences from the non-fusion MegaPBTree:

        1. allow_5merge is always FALSE. Not a parameter.
        2. fusion_max_dist --- maximum allowed distance on internal fusion sites to be called as equivalent fusions
        """
        # allow_5merge is hard-wired to False for fusions
        super(MegaPBTreeFusion, self).__init__(gff_filename, group_filename, internal_fuzzy_max_dist, self_prefix, False, fastq_filename)

        self.fusion_max_dist = fusion_max_dist

        # fusion id -> its component records, ex: PBfusion.1 -> [PBfusion.1.1, PBfusion.1.2]
        self.record_d_fusion = {fusion_id: records
                                for fusion_id, records in GFF.collapseGFFFusionReader(gff_filename)}
    def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None, fastq_filename=None, fusion_max_dist=10):
        """
        Differences with non-fusion MegaPBTree:

        1. allow_5merge is always FALSE. Not a parameter.
        2. fusion_max_dist --- maximum allowed distance on internal fusion sites to be called as equivalent fusions

        :param gff_filename: collapsed fusion GFF used to seed the tree
        :param group_filename: matching group file
        :param internal_fuzzy_max_dist: fuzziness allowed on internal junction matching
        :param self_prefix: prefix for this sample's ids (presumably; inherited parameter — confirm in base class)
        :param fastq_filename: optional FASTQ of the transcripts
        :param fusion_max_dist: see point 2 above
        """
        # Fifth positional argument pins allow_5merge to False (see point 1).
        super(MegaPBTreeFusion, self).__init__(gff_filename, group_filename, internal_fuzzy_max_dist, self_prefix, False, fastq_filename)

        self.fusion_max_dist = fusion_max_dist

        # Map each fusion id to its component records,
        # ex: PBfusion.1 -> [PBfusion.1.1, PBfusion.1.2]
        self.record_d_fusion = dict((fusion_id, records) for fusion_id,records in GFF.collapseGFFFusionReader(gff_filename))
Ejemplo n.º 6
0
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files.

    :param group_filename: group file; first whitespace-separated field is the PBID
    :param gff_filename: collapsed fusion GFF
    :param count_filename: count file with a 14-line header, then a tab-delimited
        table containing a 'pbid' column
    :param fastq_filename: optional FASTQ whose record ids are '<pbid>|...'
    :return: raise Exception if sanity check failed
    """
    # BUG FIX: this block used Python 2-only syntax (print >> sys.stderr,
    # `raise Exception, msg`, xrange) which is a SyntaxError under Python 3;
    # also the file handles were never closed.
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    with open(group_filename) as f:
        ids1 = [line.strip().split()[0] for line in f]
    ids2 = [fusion_id for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)]
    with open(count_filename) as f:
        for i in range(14):
            f.readline()  # just through the header
        ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        with open(fastq_filename) as f:
            ids4 = [r.id.split('|')[0] for r in SeqIO.parse(f, 'fastq')]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
Ejemplo n.º 7
0
def sample_sanity_check(group_filename,
                        gff_filename,
                        count_filename,
                        fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files.

    :param group_filename: group file; first whitespace-separated field is the PBID
    :param gff_filename: collapsed fusion GFF
    :param count_filename: count file with a '#'-prefixed header, then a
        tab-delimited table containing a 'pbid' column
    :param fastq_filename: optional FASTQ whose record ids are '<pbid>|...'
    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    # FIX: use context managers — the three file handles were never closed.
    with open(group_filename) as f:
        ids1 = [line.strip().split()[0] for line in f]
    ids2 = [
        fusion_id
        for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)
    ]
    with open(count_filename) as f:
        while True:
            # advance through the headers which start with #
            cur = f.tell()
            if not f.readline().startswith('#'):
                # first non-# line seen, or EOF (readline returned '')
                f.seek(cur)
                break
        ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    if len(set(ids2).difference(ids1)) > 0 or len(
            set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))

    if fastq_filename is not None:
        with open(fastq_filename) as f:
            ids4 = [
                r.id.split('|')[0]
                for r in SeqIO.parse(f, 'fastq')
            ]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))