def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
    """Merge the fusion records from <gff_filename> into the current tree.

    Builds `combined`, a list of (r1, r2) pairs where:
      - r2 is None  -> record exists only in the current tree
      - r1 is None  -> record exists only in the incoming sample
      - both set    -> matched records; the longer one seeds the cluster tree
    then re-clusters everything and delegates output to write_cluster_tree_as_gff.
    """
    combined = []
    # list(): dict.keys() returns a view in Python 3, which has no .remove()
    unmatched_recs = list(self.record_d_fusion.keys())
    for _id, records in GFF.collapseGFFFusionReader(gff_filename):
        match_seqid = self.match_fusion_record(records)
        if match_seqid is not None:
            combined.append((self.record_d_fusion[match_seqid], records))
            try:
                unmatched_recs.remove(match_seqid)
            except ValueError:
                pass  # already deleted, OK, this happens for single-exon transcripts
        else:  # r is not present in current tree
            combined.append((None, records))
    # put whatever is left from the tree in
    for seqid in unmatched_recs:
        combined.append((self.record_d_fusion[seqid], None))
    # create a ClusterTree to re-calc the loci/transcripts
    final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    for i, (r1s, r2s) in enumerate(combined):
        # cluster on the longer of the two matched records (or whichever exists)
        if r2s is None or (r1s is not None and r1s[0].end - r1s[0].start > r2s[0].end - r2s[0].start):
            final_tree[r1s[0].chr][r1s[0].strand].insert(r1s[0].start, r1s[0].end, i)
        else:
            final_tree[r2s[0].chr][r2s[0].strand].insert(r2s[0].start, r2s[0].end, i)
    self.write_cluster_tree_as_gff(final_tree, combined, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
def add_sample(self, gff_filename, group_filename, sample_prefix, output_prefix, fastq_filename=None):
    """Fold a new sample's fusion records into the existing tree.

    Produces a list of (tree_record, sample_record) pairs — one side is None
    when the record appears in only one source; when both are present the
    longer record seeds the re-clustering — then rebuilds the loci and hands
    off to write_cluster_tree_as_gff.
    """
    # pending: tree seqids not yet matched against the incoming sample
    pending = list(self.record_d_fusion.keys())
    merged = []
    for _id, records in GFF.collapseGFFFusionReader(gff_filename):
        hit = self.match_fusion_record(records)
        if hit is None:
            # sample-only record, not present in the current tree
            merged.append((None, records))
            continue
        merged.append((self.record_d_fusion[hit], records))
        try:
            pending.remove(hit)
        except ValueError:
            pass  # already deleted, OK, this happens for single-exon transcripts
    # tree-only records the sample never matched
    merged.extend((self.record_d_fusion[seqid], None) for seqid in pending)
    # rebuild the loci/transcript clusters from scratch
    final_tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    for idx, (t_recs, s_recs) in enumerate(merged):
        prefer_tree = s_recs is None or (
            t_recs is not None and t_recs[0].end - t_recs[0].start > s_recs[0].end - s_recs[0].start)
        rep = t_recs[0] if prefer_tree else s_recs[0]
        final_tree[rep.chr][rep.strand].insert(rep.start, rep.end, idx)
    self.write_cluster_tree_as_gff(final_tree, merged, group_filename, sample_prefix, output_prefix, fastq_filename2=fastq_filename)
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files

    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [fusion_id for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)]
    f = open(count_filename)
    # skip past the '#'-prefixed comment header instead of assuming a fixed
    # 14-line header; stop at the first data line (seek back to it) or at EOF
    while True:
        cur = f.tell()
        if not f.readline().startswith('#') or f.tell() == cur:
            f.seek(cur)
            break
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    # every PBID in the GFF must appear in both the group and count files
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))
    if fastq_filename is not None:
        ids4 = [r.id.split('|')[0] for r in SeqIO.parse(open(fastq_filename), 'fastq')]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
def __init__(self, gff_filename, group_filename, internal_fuzzy_max_dist=0, self_prefix=None, fastq_filename=None, fusion_max_dist=10):
    """
    Differences with non-fusion MegaPBTree:

    1. allow_5merge is always FALSE. Not a parameter.
    2. fusion_max_dist --- maximum allowed distance on internal fusion sites to be called as equivalent fusions
    """
    super(MegaPBTreeFusion, self).__init__(
        gff_filename, group_filename, internal_fuzzy_max_dist,
        self_prefix, False, fastq_filename)
    self.fusion_max_dist = fusion_max_dist
    # ex: PBfusion.1 -> [PBfusion.1.1, PBfusion.1.2]
    self.record_d_fusion = {fusion_id: records
                            for fusion_id, records in GFF.collapseGFFFusionReader(gff_filename)}
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files

    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    ids1 = [line.strip().split()[0] for line in open(group_filename)]
    ids2 = [fusion_id for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)]
    f = open(count_filename)
    # skip past the '#'-prefixed comment header instead of assuming a fixed
    # 14-line header; stop at the first data line (seek back to it) or at EOF
    while True:
        cur = f.tell()
        if not f.readline().startswith('#') or f.tell() == cur:
            f.seek(cur)
            break
    ids3 = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    # every PBID in the GFF must appear in both the group and count files
    if len(set(ids2).difference(ids1)) > 0 or len(set(ids2).difference(ids3)) > 0:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))
    if fastq_filename is not None:
        ids4 = [r.id.split('|')[0] for r in SeqIO.parse(open(fastq_filename), 'fastq')]
        if len(set(ids2).difference(ids4)) > 0:
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))
def sample_sanity_check(group_filename, gff_filename, count_filename, fastq_filename=None):
    """
    Double check that the formats are expected and all PBIDs are concordant across the files

    :return: raise Exception if sanity check failed
    """
    print("Sanity checking. Retrieving PBIDs from {0},{1},{2}...".format(
        group_filename, gff_filename, count_filename), file=sys.stderr)
    group_ids = [line.strip().split()[0] for line in open(group_filename)]
    gff_ids = [fusion_id for fusion_id, rs in GFF.collapseGFFFusionReader(gff_filename)]
    f = open(count_filename)
    # advance past the '#'-prefixed header lines, leaving f positioned at the
    # first data line (or at EOF if the file is all header)
    while True:
        pos = f.tell()
        line = f.readline()
        hit_eof = f.tell() == pos
        if hit_eof or not line.startswith('#'):
            f.seek(pos)
            break
    count_ids = [r['pbid'] for r in DictReader(f, delimiter='\t')]
    # every PBID in the GFF must appear in both the group and count files
    missing_in_group = set(gff_ids).difference(group_ids)
    missing_in_count = set(gff_ids).difference(count_ids)
    if missing_in_group or missing_in_count:
        raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0} and {2}".format(
            group_filename, gff_filename, count_filename))
    if fastq_filename is not None:
        fastq_ids = [r.id.split('|')[0] for r in SeqIO.parse(open(fastq_filename), 'fastq')]
        if set(gff_ids).difference(fastq_ids):
            raise Exception("Sanity check failed! Please make sure the PBIDs listed in {1} are also in {0}".format(
                fastq_filename, gff_filename))