def merge_loci(df):
    # Cluster the loci
    tree = ClusterTree(0, 0)
    for i, marker in enumerate(df.index):
        tree.insert(df.ix[marker, 'locus_gene_boundaries'][0],
                    df.ix[marker, 'locus_gene_boundaries'][1], i)
    # Create new dataframe with overlapping loci
    df_addon = pd.DataFrame()
    rows2drop = []
    for i, (start, end, overlapping_loci) in enumerate(tree.getregions()):
        if len(overlapping_loci) > 1:
            marker = ";".join(df.index[overlapping_loci])
            df_addon.ix[marker, 'snp_id'] = ";".join(df.ix[overlapping_loci, 'snp_id'])
            df_addon.ix[marker, 'locus_start'] = start
            df_addon.ix[marker, 'locus_end'] = end
            genes_in_locus_set = set(
                (";".join([(str(x) if not isinstance(x, float) else '')
                           for x in df.ix[overlapping_loci, 'genes_in_locus']])).split(";"))
            genes_in_locus_set.discard('')
            df_addon.ix[marker, 'genes_in_locus'] = ";".join(genes_in_locus_set)
            df_addon.ix[marker, 'nearest_gene'] = ";".join(df.ix[overlapping_loci, 'nearest_gene'])
            df_addon.ix[marker, 'chr'] = df.ix[overlapping_loci[0], 'chr']
            rows2drop.extend(overlapping_loci)
    # Add merged locus and drop overlapping loci
    if not df.empty:
        df = df.drop(df.index[rows2drop])
        df = df.append(df_addon)
    return df
def sep_by_clustertree(records):
    tree = ClusterTree(0, 0)
    for i, r in enumerate(records):
        tree.insert(r.sStart, r.sEnd, i)
    result = []
    for s, e, indices in tree.getregions():
        result.append([records[i] for i in indices])
    return result
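# A minimal usage sketch for sep_by_clustertree above (hedged: FakeRec is a
# hypothetical stand-in for the real alignment records, which only need
# sStart/sEnd attributes). Overlapping records land in the same group.
from collections import namedtuple

FakeRec = namedtuple('FakeRec', ['sStart', 'sEnd'])
demo_recs = [FakeRec(0, 100), FakeRec(50, 150), FakeRec(500, 600)]
for group in sep_by_clustertree(demo_recs):
    print([(r.sStart, r.sEnd) for r in group])
# expected: [(0, 100), (50, 150)] then [(500, 600)]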
def get_gtf_metadata(gtf_file, omit_attrs=None, group_by="gene_id", feature_type="exon"):
    if omit_attrs is None:
        omit_attrs = []
    # read gtf file and group by gene
    gene_feature_map = collections.defaultdict(lambda: [])
    gene_attrs_set = set()
    for feature in GTFFeature.parse(open(gtf_file)):
        if feature.feature_type != feature_type:
            continue
        feature_id = feature.attrs[group_by]
        gene_feature_map[feature_id].append(feature)
        gene_attrs_set.update(feature.attrs.keys())
    gene_attrs_set.difference_update(omit_attrs)
    gene_attrs_list = sorted(gene_attrs_set)
    metadata_fields = ["tracking_id", "locus", "strand", "num_exons",
                       "transcript_length"] + gene_attrs_list
    metadata_inds = dict((x, i) for i, x in enumerate(metadata_fields))
    metadata_dict = {}
    # output metadata sorted by gene id
    for feature_id, features in gene_feature_map.iteritems():
        # collect attributes for this gene
        attrdict = collections.defaultdict(lambda: set())
        # cluster exons together for each gene
        cluster_tree = ClusterTree(0, 1)
        for i, f in enumerate(features):
            cluster_tree.insert(f.start, f.end, i)
            for k, v in f.attrs.iteritems():
                if k in gene_attrs_set:
                    # some attributes have multiple values separated by a comma
                    attrdict[k].update(v.split(','))
        # determine larger exon clusters
        transcript_length = 0
        exon_clusters = []
        for start, end, indexes in cluster_tree.getregions():
            exon_clusters.append((start, end))
            transcript_length += (end - start)
        del cluster_tree
        chrom = features[0].seqid
        locus_start = min(e[0] for e in exon_clusters)
        locus_end = max(e[1] for e in exon_clusters)
        locus_string = "%s:%d-%d" % (chrom, locus_start, locus_end)
        strand = features[0].strand
        num_exons = len(exon_clusters)
        # make metadata row
        metadata = [feature_id, locus_string, strand, num_exons,
                    transcript_length] + ['NA'] * len(gene_attrs_list)
        # get all attributes
        for k, vals in attrdict.iteritems():
            ind = metadata_inds[k]
            metadata[ind] = ','.join(map(str, sorted(vals)))
        metadata_dict[metadata[0]] = metadata
    return metadata_fields, metadata_dict
def read_GFF(gff_filename, logf):
    """
    Read a GFF filename and get the gene regions

    :return: dict of (PB.X) --> LocusInfo
    """
    gff_info = {}  # loci --> LocusInfo
    tmp = {}  # loci PB.X --> list of GFF records for PB.X.Y

    for r in collapseGFFReader(gff_filename):
        m = rex_pbid.match(r.seqid)
        if m is None:
            raise Exception, "Expected PBID format PB.X.Y but saw {0}".format(r.seqid)
        locus = m.group(1)  # ex: PB.1
        if locus not in tmp:
            tmp[locus] = [r]
            gff_info[locus] = LocusInfo(chrom=r.chr, strand=r.strand,
                                        regions=None, isoforms=None)
        else:
            if gff_info[locus].chrom != r.chr:
                logf.write("WARNING: Expected {0} to be on {1} but saw {2}. "
                           "Could be minimap2 multi-mapping inconsistency for "
                           "repetitive genes. Check later.\n".format(
                               r.seqid, gff_info[locus].chrom, r.chr))
            tmp[locus].append(r)

    # now figure out the exonic regions for each gene PB.X
    for locus, records in tmp.iteritems():
        c = ClusterTree(0, 0)
        for r in records:
            for e in r.ref_exons:
                c.insert(e.start - extra_bp_around_junctions,
                         e.end + extra_bp_around_junctions, 1)
        regions = [(a, b) for (a, b, junk) in c.getregions()]
        regions[0] = (regions[0][0] - __padding_before_after__, regions[0][1])
        regions[-1] = (regions[-1][0], regions[-1][1] + __padding_before_after__)
        gff_info[locus] = LocusInfo(chrom=gff_info[locus].chrom,
                                    strand=gff_info[locus].strand,
                                    regions=regions,
                                    isoforms=[r.seqid for r in records])
    return gff_info
def cluster_txs(txs):
    """Uses a ClusterTree to cluster transcript objects.
    TODO: use clusterGenes instead"""
    cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    for i, tx in enumerate(txs):
        cluster_trees[tx.chromosome].insert(tx.start, tx.stop, i)
    # convert the clusters to a nested structure of chrom -> cluster_id -> tx objects
    clustered_reads = collections.defaultdict(dict)
    cluster_id = 0
    for chrom, cluster_tree in cluster_trees.iteritems():
        for start, end, interval_indices in cluster_tree.getregions():
            clustered_reads[chrom][cluster_id] = [txs[ix] for ix in interval_indices]
            cluster_id += 1
    return clustered_reads
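# Hedged sketch of cluster_txs with hypothetical transcript objects that carry
# the chromosome/start/stop attributes the function expects. Note cluster ids
# are assigned in dict-iteration order, so the numbering across chromosomes is
# not guaranteed.
from collections import namedtuple

FakeTx = namedtuple('FakeTx', ['chromosome', 'start', 'stop'])
demo_txs = [FakeTx('chr1', 0, 100), FakeTx('chr1', 50, 200), FakeTx('chr2', 0, 100)]
clustered = cluster_txs(demo_txs)
# expected: the two overlapping chr1 transcripts share one cluster id,
# the chr2 transcript gets its own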
def combine_regions(all_regions, required_regions=1):
    """Generate the combination of a set of chrom, start, end regions.

    If required_regions is 1 then this is a union combination.
    Otherwise it is an intersection.
    """
    clusters = collections.defaultdict(lambda: ClusterTree(0, required_regions))
    i = 0
    for region_gen in all_regions:
        for chrom, start, end in region_gen:
            clusters[chrom].insert(start, end, i)
            i += 1
    for chrom, cluster in clusters.iteritems():
        for (s, e, _) in cluster.getregions():
            yield chrom, s, e
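# Hedged sketch of the combine_regions semantics described in its docstring:
# required_regions=1 reports every merged span (union); required_regions=2
# reports only spans supported by at least two inputs (intersection-like --
# note the reported span is still the full merged cluster, not the overlap).
regions_a = [('chr1', 0, 100), ('chr1', 200, 300)]
regions_b = [('chr1', 50, 150)]
print(list(combine_regions([regions_a, regions_b], required_regions=1)))
# expected: [('chr1', 0, 150), ('chr1', 200, 300)]
print(list(combine_regions([regions_a, regions_b], required_regions=2)))
# expected: [('chr1', 0, 150)] -- only where regions_a and regions_b stack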
def _cluster_by(end_iter, attr1, attr2, cluster_distance):
    """Cluster breakends by specified attributes.
    """
    ClusterInfo = namedtuple("ClusterInfo", ["chroms", "clusters", "lookup"])
    chr_clusters = {}
    chroms = []
    brends_by_id = {}
    for brend in end_iter:
        if not chr_clusters.has_key(brend.chrom1):
            chroms.append(brend.chrom1)
            chr_clusters[brend.chrom1] = ClusterTree(cluster_distance, 1)
        brends_by_id[int(brend.name)] = brend
        chr_clusters[brend.chrom1].insert(getattr(brend, attr1),
                                          getattr(brend, attr2),
                                          int(brend.name))
    return ClusterInfo(chroms, chr_clusters, brends_by_id)
def build_cluster_trees(bed_generator, cluster_distance, read_count):
    """
    arguments to ClusterTree are:
    - Distance in basepairs for two reads to be in the same cluster;
      for instance 20 would group all reads with 20bp of each other
    - Number of reads necessary for a group to be considered a cluster;
      2 returns all groups with 2 or more overlapping reads
    """
    if options.verbose:
        syserr("Making ClusterTree\n")
    cluster_trees = collections.defaultdict(
        lambda: ClusterTree(cluster_distance, read_count))
    i = 0
    read_ids_mapping = {}
    for read_id, match_id, strand, start, end in bed_generator:
        cluster_trees["%s:%s" % (match_id, strand)].insert(start, end, i)
        read_ids_mapping[i] = read_id
        i += 1
    return cluster_trees, read_ids_mapping
def build_cluster_trees(reads, cluster_distance=2, min_size=2):
    """Build cluster tree of reads from a dataframe of locations,
    e.g. from a set of aligned reads from a sam file.

    Args:
        cluster_distance: Distance in basepairs for two reads to be in the
            same cluster; for instance 20 would group all reads with 20bp
            of each other
        min_size: Number of reads necessary for a group to be considered a
            cluster; 2 returns all groups with 2 or more overlapping reads

    Returns:
        dict of ClusterTrees per chromosome
    """
    import collections
    from bx.intervals.cluster import ClusterTree

    cluster_trees = collections.defaultdict(
        lambda: ClusterTree(cluster_distance, min_size))
    for i, row in reads.iterrows():
        chrom = row['name']
        #print chrom, row.read_id, row.start, row.end
        cluster_trees[chrom].insert(row.start, row.end, row.name)
    return dict(cluster_trees)
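# Hedged illustration of the two ClusterTree parameters documented in the two
# build_cluster_trees variants above: intervals within `distance` bp are
# merged, and getregions() only reports clusters with at least `min_size`
# members.
from bx.intervals.cluster import ClusterTree

demo_tree = ClusterTree(20, 2)     # merge within 20 bp; report clusters of >= 2
demo_tree.insert(100, 150, 0)
demo_tree.insert(160, 210, 1)      # 10 bp gap -> same cluster as read 0
demo_tree.insert(5000, 5050, 2)    # isolated read -> dropped (cluster size 1)
print(demo_tree.getregions())      # expected: [(100, 210, [0, 1])]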
def categorize_aln_by_annotation(
    gene_annotation_file: str,
    input_fasta: str,
    input_sam: str,
    output_prefix: str,
    min_overlap_bp: int = 200,
    min_query_overlap: float = 0.5,
    min_gene_overlap: float = 0.8,
) -> None:
    t = defaultdict(
        lambda: {"+": IntervalTree(), "-": IntervalTree()}
    )  # chr -> strand -> IntervalTree
    info = {}
    # reader = DictReader(open('ProteinTable149_154224.txt'),delimiter='\t')
    for r in DictReader(open(gene_annotation_file), delimiter="\t"):
        if r["#Replicon Name"] != "chr":
            logger.info(f"Ignore {r}")
            continue
        info[r["Locus tag"]] = (int(r["Start"]), int(r["Stop"]), r["Locus tag"])
        t[r["Replicon Accession"]][r["Strand"]].add(
            int(r["Start"]), int(r["Stop"]), r["Locus tag"]
        )
    # pdb.set_trace()

    result = defaultdict(lambda: [])  # gene -> list of rec
    d = {r.id: len(r.seq) for r in SeqIO.parse(open(input_fasta), "fasta")}
    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        # if r.qID == 'm151125_055539_42275_c100921822550000001823204305121656_s1_p0/121461/30_2108_CCS':
        #     pdb.set_trace()
        ans = match_w_annotation(
            t, r, info, min_overlap_bp, min_query_overlap, min_gene_overlap
        )
        # ans is AMatch(name, strand, start, end, record)
        result[ans.name].append(ans)

    novel_ct = defaultdict(lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)})
    novel_list = []
    novel_index = 0
    with open(f"{output_prefix}.sam", "w") as f, open(
        f"{output_prefix}.report.txt", "w"
    ) as f1:
        f.write(reader.header)
        f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")
        for k, v in result.items():
            # v is: list of AMatch(name, strand, start, end, record)
            if k.startswith("novel-unannotated"):
                # write novel later, we are grouping them by loci first
                # tagRG='novel'
                for x in v:
                    novel_ct[x.record.sID][x.strand].insert(x.start, x.end, novel_index)
                    novel_index += 1
                    novel_list.append(x)
                continue
            elif k.startswith("novel-antisense"):
                tagRG = "novel-antisense"
            elif k.startswith("novel-partial"):
                tagRG = "novel-partial"
            elif k.startswith("poly-"):
                tagRG = "poly"
            else:
                tagRG = "single"
            v.sort(
                key=lambda x: (x.start, x.end),
                reverse=bool(v[0].strand == "-"),
            )  # sort by start, then end
            for i, x in enumerate(v):
                f.write(
                    f"{x.record.record_line}\tSN:Z:{i + 1:06d}\tRG:Z:{tagRG}\tgn:Z:{k}\n"
                )
                if x.strand == "+":
                    f1.write(
                        f"{x.record.qID}\t{tagRG}\t{k}\t{i + 1:06d}\t{x.strand}\t{x.start + 1}\t{x.end}\n"
                    )
                else:  # - strand, start is end, end is start
                    f1.write(
                        f"{x.record.qID}\t{tagRG}\t{k}\t{i + 1:06d}\t{x.strand}\t{x.end}\t{x.start + 1}\n"
                    )
        # now write the novel stuff, grouped by regions
        novel_region_index = 1
        for d1 in novel_ct.values():
            for ct in d1.values():
                gn = f"novel-{str(novel_region_index)}"
                for *_, _indices in ct.getregions():
                    v = [novel_list[ind] for ind in _indices]
                    v.sort(
                        key=lambda x: (x.start, x.end),
                        reverse=bool(v[0].strand == "-"),
                    )  # sort by start, then end
                    for i, x in enumerate(v):
                        f.write(
                            f"{x.record.record_line}\tSN:Z:{i + 1:06d}\tRG:Z:novel-unannotated\tgn:Z:{gn}\n"
                        )
                        if x.strand == "+":
                            f1.write(
                                f"{x.record.qID}\tnovel-unannotated\t{gn}\t{i + 1:06d}\t{x.strand}\t{x.start + 1}\t{x.end}\n"
                            )
                        else:
                            f1.write(
                                f"{x.record.qID}\tnovel-unannotated\t{gn}\t{i + 1:06d}\t{x.strand}\t{x.end}\t{x.start + 1}\n"
                            )
                novel_region_index += 1
    logger.info(f"Output written to: {f.name}")
    logger.info(f"Output written to: {f1.name}")
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group,
                     addon_name, fuzzy_junction, allow_5merge, max_3_diff, n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1
    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)
    #print("# of recs: {0}, cpus: {1}, chunk_size: {2}".format(n, n_chunks, chunk_size))

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(
                        recs[cur].seqid,
                        ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
                # note: because we are limited by how the records are organized
                # by (chrom, strand) we may not end up using all the chunks,
                # ex: if all records are on the same locus, we end up writing
                # everything to one split file
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    f_gff.close()
                    f_group.close()
                    split_files.append((f_gff.name, f_group.name))
                    if i >= n_chunks or counter >= len(recs):
                        break
                    f_gff = open(addon_gff + '.split' + str(i), 'w')
                    f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction,
                          allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    #print("split files: {0}, result_prefix: {1}".format(split_files, result_prefixes))
    return result_prefixes, split_files
def make_fake_genome(genome_filename, gff_filename, ref_chr, ref_start,
                     ref_end, ref_strand, output_prefix, output_name, genome_d=None):
    if genome_d is None:
        print >> sys.stderr, "Reading genome file {0}...".format(genome_filename)
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), 'fasta'))
    else:
        d = genome_d

    print >> sys.stderr, "Reading GFF file {0}...".format(gff_filename)
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if r.chr == ref_chr and r.strand == ref_strand and \
                (ref_start <= r.start < r.end <= ref_end) \
                and len(r.ref_exons) > 1:
            print >> sys.stderr, "Adding {0} to fake genome.".format(r.seqid)
            good.append(r)

    if len(good) == 0:
        print >> sys.stderr, "Did not find any transcripts strictly within {0}:{1}-{2} on strand {3}. Abort!".format(
            ref_chr, ref_start, ref_end, ref_strand)
        sys.exit(-1)

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(e.start - extra_bp_around_junctions,
                     e.end + extra_bp_around_junctions, 1)

    regions = [(a, b) for (a, b, junk) in c.getregions()]
    regions[0] = (regions[0][0] - __padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1] + __padding_before_after__)

    with open(output_prefix + '.fasta', 'w') as f:
        f.write(">" + output_name + "\n")
        for a, b in regions:
            f.write(str(d[r.chr][a:b].seq))
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix + '.mapping.txt', 'w') as f:
        i = 0
        for a, b in regions:
            for j in xrange(a, b):
                f.write("{0},{1},{2}\n".format(i, ref_chr, j))
                i += 1

    with open(output_prefix + '.pbids.txt', 'w') as f:
        f.write("\n".join(r.seqid for r in good) + '\n')

    print >> sys.stderr, "Output written to {0}.fasta, {0}.mapping.txt, {0}.pbids.txt.".format(output_prefix)
def loc2region(li):
    clu = ClusterTree(0, 0)
    for x in li:
        clu.insert(x, x + 1, 0)
    for start, end, _ in clu.getregions():
        yield (start, end)
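# Quick sketch: loc2region above collapses single positions into maximal runs
# of consecutive coordinates (each position x is inserted as the interval
# [x, x + 1)).
print(list(loc2region([1, 2, 3, 10, 11, 20])))
# expected: [(1, 4), (10, 12), (20, 21)]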
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2, blastn_filename=None):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (if genome2 does not exist, just repeat genome1)
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return

    seq_info = defaultdict(lambda: [])
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.sam')
    reader = BioReaders.GMAPSAMReader(filename, True,
        query_len_dict=dict((r.id, len(r.seq)) for r in SeqIO.parse(
            open(os.path.join(dirname, 'in.trimmed.fa')), 'fasta')))
    for r in reader:
        seq_info[r.qID].append(r)
        contigs_seen.add(r.sID)

    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, 'in.fa')), 'fasta'):
        assert r.id in seq_info

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_sam(
        os.path.join(dirname, 'cogent2.fa'),
        os.path.join(dirname, 'cogent2.fa.' + genome1 + '.sam'))
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_sam(
        os.path.join(dirname, 'cogent2.fa'),
        os.path.join(dirname, 'cogent2.fa.' + genome2 + '.sam'))

    if blastn_filename is not None:
        qlen_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(
            open(os.path.join(dirname, 'in.trimmed.fa')), 'fasta'))
        best_of = read_blastn(os.path.join(dirname, blastn_filename), qlen_dict)

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info), len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
        len(contig_genome1), cov1, acc1, has_chimeric1, ",".join(contig_genome1)))
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}".format(
        len(contig_genome2), cov2, acc2, has_chimeric2, ",".join(contig_genome2)))
    # (for blastn, optional) best name with best e-value
    if blastn_filename is not None:
        if len(best_of) == 0:
            f1.write("\t0\tNA\n")
        else:
            stuff = best_of.values()  # list of (e-value, name)
            stuff.sort()
            f1.write("\t{0}\t\"{1}\"\n".format(
                sum(_n != 'NA' for _e, _n in best_of.values()), stuff[0][1]))
    else:
        f1.write("\n")

    in_aligned_to_genome1 = os.path.join(dirname, 'in.trimmed.fa.' + genome1 + '.sam')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_sam(
            os.path.join(dirname, 'in.trimmed.fa'), in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.iteritems():
        contigs = [x.sID for x in v]
        acc = sum(x.identity * x.qCoverage for x in v) / sum(x.qCoverage for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname, ",".join(contigs), acc))
        if not seqid in d3:
            f2.write("NA\t0\tNA\tNA")
            if blastn_filename is not None:
                f2.write("\tNA\n")
            else:
                f2.write("\n")
        else:
            scaffolds = [x.sID for x in d3[seqid]]
            # calculate cov and acc
            c = ClusterTree(0, 0)
            for x in d3[seqid]:
                qlen = x.qLen
                c.insert(x.qStart, x.qEnd, -1)
            cov = sum(_e - _s for _s, _e, _junk in c.getregions()) * 100. / qlen
            acc = sum(x.identity * x.qCoverage for x in d3[seqid]) * 1. / sum(
                x.qCoverage for x in d3[seqid])
            f2.write("{0}\t{1}\t{2}\t{3}".format(
                ",".join(scaffolds), len(scaffolds), cov, acc))
            if blastn_filename is not None:
                f2.write("\t{0}\n".format(best_of[seqid][1]))
            else:
                f2.write("\n")
def chain_split_file(
    ref_gff: Path,
    ref_group: Path,
    ref_name: str,
    addon_gff: Path,
    addon_group: Path,
    addon_name: str,
    fuzzy_junction: int,
    allow_5merge: bool,
    max_3_diff: int,
    n_chunks: int,
) -> Tuple[List[str], List[str]]:
    """
    Organize entries in both a gff and transcript group file and split both
    such that the original two files are split into chunks where gff.chunk.n
    covers the same entries as group.chunk.n
    """
    # read in the group_file as a dictionary in the form of
    # {
    #     'PB.1.1': ["transcript/1"],
    #     'PB.1.2': ["transcript/2", "transcript/3"]
    # }
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    # with addon_group.open('r') as ag:
    #     addon_group_info = {_.split('\t')[0]: _.split('\t')[1].split(",") for _ in ag.readlines()}
    recs = []
    tree = OrderedDict()
    i = 0
    # for r in HTSeq.GFF_Reader(addon_gff):
    #     if r.iv.chrom not in tree:
    #         tree[r.iv.chrom] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
    #     tree[r.iv.chrom][r.iv.strand].insert(r.iv.start, r.iv.end, i)
    #     recs.append(r)
    #     i += 1

    # This should build a structure in the form of:
    # {"chrN":
    #     {
    #         "+": bx.intervals.cluster.ClusterTree,
    #         "-": bx.intervals.cluster.ClusterTree,
    #     },
    #  "chrN+1":
    #     {
    #         "+": bx.intervals.cluster.ClusterTree,
    #         "-": bx.intervals.cluster.ClusterTree,
    #     },
    # }
    # ClusterTree objects have the form
    #     [(x, y, [z]), (a, b, [c]), (m, n, [o])]
    # where each tuple is a range and a list of ids that lie within that range
    # e.g. (from the bx-python docs):
    #     tree = ClusterTree(0, 0)
    #     insert (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)
    #     tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]
    # NOTE: GFF.collapseGFFReader is a specialized GFF reader that in the attributes
    # field stores a list of bx.intervals.intersection.Interval objects
    # describing the exons
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
    f_group = open(f"{addon_group}.split{str(i)}", "w")
    # this loop is going to reorder everything
    # so that we have a GFF with a transcript followed by all the exons that
    # made up that transcript and a separate file with the matching
    # transcript_id transcript/read_group#
    # (see the sp.MegaPBTree above)
    for v1 in tree.values():
        for strand in ("+", "-"):
            v2 = v1[strand]
            for *_, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write(
                        f"{recs[cur].seqid}\t{','.join(addon_group_info[recs[cur].seqid])}\n"
                    )
                    counter += 1
                if counter >= (i + 1) * chunk_size:
                    i += 1
                    n = f_gff.tell()
                    f_gff.close()
                    f_group.close()
                    if n == 0:  # didn't write any records, delete these
                        Path(f_gff.name).unlink()
                        Path(f_group.name).unlink()
                    else:
                        split_files.append((f_gff.name, f_group.name))
                    if i >= n_chunks or counter >= len(recs):
                        break
                    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
                    f_group = open(f"{addon_group}.split{str(i)}", "w")
    if not f_gff.closed:
        n = f_gff.tell()
        f_gff.close()
        f_group.close()
        if n == 0:  # didn't write any records, delete these
            Path(f_gff.name).unlink()
            Path(f_group.name).unlink()
        else:
            split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(
            target=chain_helper,
            args=(
                ref_gff,
                ref_group,
                split_gff,
                split_group,
                ref_name,
                f"{addon_name}.{str(i)}",
                fuzzy_junction,
                allow_5merge,
                max_3_diff,
            ),
        )
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, f"{addon_name}.{str(i)}"))
    for p in pools:
        p.join()
    return result_prefixes, split_files
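# The bx-python docs example quoted in the comments of chain_split_file above,
# as runnable code:
from bx.intervals.cluster import ClusterTree

docs_tree = ClusterTree(0, 0)
for start, end, _id in [(6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)]:
    docs_tree.insert(start, end, _id)
print(docs_tree.getregions())
# [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]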
def total_coverage(tmprecs):
    tree = ClusterTree(0, 0)
    for r in tmprecs:
        tree.insert(r.qStart, r.qEnd, -1)
    return sum(reg[1] - reg[0] for reg in tree.getregions())
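# Hedged sketch of total_coverage above: overlapping query spans are merged
# before summing, so double-covered bases count once. FakeAln is a hypothetical
# stand-in for records with qStart/qEnd attributes.
from collections import namedtuple

FakeAln = namedtuple('FakeAln', ['qStart', 'qEnd'])
demo_alns = [FakeAln(0, 100), FakeAln(50, 150), FakeAln(300, 400)]
print(total_coverage(demo_alns))  # expected: 250 (merged 0-150 plus 300-400)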
def write_reclist_to_gff_n_info(
    rec_list: Dict[str, Any],
    final_prefix: str,
    ref_name: str,
    addon_name: str,
    use_fq: bool = False,
) -> Dict[str, str]:
    # now go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)})
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(
            match_rec.rec.start, match_rec.rec.end, i)
    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(tree_keys_numeric) + sorted(tree_keys_alpha)

    writer_info = DictWriter(
        Path(f"{final_prefix}.mega_info.txt").open("w"),
        fieldnames=["superPBID", ref_name, addon_name],
        delimiter="\t",
    )
    writer_info.writeheader()

    if use_fq:
        f_fq = Path(f"{final_prefix}.rep.fq")
    with open(f"{final_prefix}.gff", "w") as f_gff, open(
        f"{final_prefix}.group.txt", "w"
    ) as f_group:
        new_group_info = {}
        pb_i = 0
        for _chr in tree_keys:
            # remember to convert potential integer chromosome keys back to string now that we sorted them!
            _chr = str(_chr)
            for _strand in ("+", "-"):
                for *_, _indices in tree[_chr][_strand].getregions():
                    # further sort these records by (start, end, num_exons)
                    _indices.sort(key=lambda i: (
                        rec_list[i].rec.start,
                        rec_list[i].rec.end,
                        len(rec_list[i].rec.ref_exons),
                    ))
                    pb_i += 1
                    for pb_j, recs_index in enumerate(_indices):
                        pbid = f"PB.{pb_i}.{pb_j + 1}"
                        match_rec = rec_list[recs_index]
                        new_group_info[pbid] = match_rec.members
                        match_rec.rec.seqid = pbid
                        GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                        writer_info.writerow({
                            "superPBID": pbid,
                            ref_name: match_rec.ref_id,
                            addon_name: match_rec.addon_id,
                        })
                        f_group.write(f"{pbid}\t{','.join(match_rec.members)}\n")
                        if use_fq:
                            match_rec.seqrec.id = pbid
                            match_rec.seqrec.description = ""
                            SeqIO.write(match_rec.seqrec, f_fq, "fastq")
    return new_group_info
def make_fake_genome(
    genome_filename,
    gff_filename,
    ref_chr,
    ref_start,
    ref_end,
    ref_strand,
    output_prefix,
    output_name=None,
    genome_d=None,
):
    if genome_d is None:
        logger.info(f"Reading genome file {genome_filename}...")
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), "fasta"))
    else:
        d = genome_d

    if output_name is None:
        output_name = f"fake_{genome_filename}"

    logger.info(f"Reading GFF file {gff_filename}...")
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if (r.chr == ref_chr and r.strand == ref_strand
                and (ref_start <= r.start < r.end <= ref_end)
                and len(r.ref_exons) > 1):
            logger.info(f"Adding {r.seqid} to fake genome.")
            good.append(r)

    if len(good) == 0:
        raise RuntimeError(
            f"Did not find any transcripts strictly within {ref_chr}:{ref_start}-{ref_end} on strand {ref_strand}. Abort!"
        )

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(
                e.start - extra_bp_around_junctions,
                e.end + extra_bp_around_junctions,
                1,
            )

    regions = [(a, b) for (a, b, junk) in c.getregions()]
    regions[0] = (regions[0][0] - __padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1] + __padding_before_after__)

    with open(output_prefix + ".fasta", "w") as f:
        f.write(">" + output_name + "\n")
        for a, b in regions:
            f.write(str(d[r.chr][a:b].seq))
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix + ".mapping.txt", "w") as f:
        i = 0
        for a, b in regions:
            for j in range(a, b):
                f.write(f"{i},{ref_chr},{j}\n")
                i += 1

    with open(output_prefix + ".pbids.txt", "w") as f:
        f.write("\n".join(r.seqid for r in good) + "\n")

    logger.info(
        f"Output written to {output_prefix}.fasta, {output_prefix}.mapping.txt, {output_prefix}.pbids.txt."
    )
def write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name, addon_name, use_fq=False):
    # now go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(
            match_rec.rec.start, match_rec.rec.end, i)
    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(list(tree_keys_numeric)) + sorted(list(tree_keys_alpha))

    f_gff = open(final_prefix + '.gff', 'w')
    f_info = open(final_prefix + '.mega_info.txt', 'w')
    writer_info = DictWriter(f_info, fieldnames=['superPBID', ref_name, addon_name],
                             delimiter='\t')
    writer_info.writeheader()
    f_group = open(final_prefix + '.group.txt', 'w')
    if use_fq:
        f_fq = open(final_prefix + '.rep.fq', 'w')

    # sort the combined gff (tree) by chromosome and strand (+ first)
    new_group_info = {}
    pb_i = 0
    for _chr in tree_keys:
        # remember to convert potential integer chromosome keys back to string now that we sorted them!
        _chr = str(_chr)
        for _strand in ('+', '-'):
            for _start, _end, _indices in tree[_chr][_strand].getregions():
                # further sort these records by (start, end, num_exons)
                _indices.sort(key=lambda i: (rec_list[i].rec.start,
                                             rec_list[i].rec.end,
                                             len(rec_list[i].rec.ref_exons)))
                pb_i += 1
                for pb_j, recs_index in enumerate(_indices):
                    pbid = "PB.{0}.{1}".format(pb_i, pb_j + 1)
                    match_rec = rec_list[recs_index]
                    new_group_info[pbid] = match_rec.members
                    match_rec.rec.seqid = pbid
                    GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                    writer_info.writerow({
                        'superPBID': pbid,
                        ref_name: match_rec.ref_id,
                        addon_name: match_rec.addon_id
                    })
                    f_group.write("{0}\t{1}\n".format(pbid, ",".join(match_rec.members)))
                    if use_fq:
                        match_rec.seqrec.id = pbid
                        match_rec.seqrec.description = ''
                        SeqIO.write(match_rec.seqrec, f_fq, 'fastq')
    f_gff.close()
    f_info.close()
    f_group.close()
    if use_fq:
        f_fq.close()
    return new_group_info
        fout.write(
            '%s\tprotein_coding\tCDS\t%d\t%d\t.\t%s\t.\t gene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; transcript_name "%s"; protein_id "%s";\n'
            % (gene.chromosome, exon[0], exon[1], gene.strand, gene.id,
               trans_id, exonNum + 1, gene.id, gene.id, trans_id))

################################################################################

if __name__ == '__main__':
    # build clusters
    clusterReads.c = 0
    readDict = {}
    clusterDist = 50
    clusterMembers = 1
    cluster_treesP = collections.defaultdict(
        lambda: ClusterTree(clusterDist, clusterMembers))
    cluster_treesN = collections.defaultdict(
        lambda: ClusterTree(clusterDist, clusterMembers))
    clusterReads(args.bamfile, cluster_treesP, cluster_treesN, readDict)
    keys = list(
        set.union(*[set(cluster_treesN.keys()), set(cluster_treesP.keys())]))

    # Transcript assembly
    geneIsos = collections.defaultdict(list)
    geneReads = collections.defaultdict(list)
    novelClustersN = collections.defaultdict(list)
    novelClustersP = collections.defaultdict(list)
    allIsos = []
    allClusters = []
    allReads = 0
    isosP = collections.defaultdict(list)
def write_cluster_tree_as_gff(
    self,
    cluster_tree: ClusterTree,
    rec_list: List[GFF.gmapRecord],
    group_filename2: Union[str, Path],
    sample_prefix2: str,
    output_prefix: str,
    fastq_filename2: Optional[Union[str, Path]] = None,
) -> Dict[str, str]:
    """
    Write ClusterTree (chr --> dict --> (start, end, rec_list_index)) as collapsedGFF format

    Returns --- a new group_info!!!
    """
    if fastq_filename2 is not None:
        fastq_dict2 = MegaPBTree.read_fastq_to_dict(fastq_filename2)
        f_fastq = Path(f"{output_prefix}.rep.fq")
    group_info2 = MegaPBTree.read_group(group_filename2, sample_prefix2)
    new_group_info = {}

    with open(f"{output_prefix}.mega_info.txt", "w") as f_mgroup:
        f_mgroup.write(f"pbid\t{self.self_prefix}\t{sample_prefix2}\n")
        fusion_index = 0
        chroms = list(cluster_tree.keys())
        chroms.sort()
        # IMPORTANT: for fusion, this is *just* the chrom of the first record!
        # Fusions can be multi-chrom
        for k in chroms:
            for strand in ("+", "-"):
                for *_, rec_indices in cluster_tree[k][strand].getregions():
                    for i in rec_indices:
                        fusion_index += 1
                        tID = f"PBfusion.{fusion_index}"
                        r1s, r2s = rec_list[i]
                        if r1s is None:  # r2s is not None
                            recs = r2s
                            r2_fusion_id = get_fusion_id(r2s[0].seqid)
                            new_group_info[tID] = group_info2[r2_fusion_id]
                            f_mgroup.write(f"{tID}\tNA\t{r2_fusion_id}\n")
                            if fastq_filename2 is not None:
                                seqrec = fastq_dict2[r2_fusion_id]
                        elif r2s is None:  # r1 is not None
                            recs = r1s
                            r1_fusion_id = get_fusion_id(r1s[0].seqid)
                            new_group_info[tID] = self.group_info[r1_fusion_id]
                            f_mgroup.write(f"{tID}\t{r1_fusion_id}\tNA\n")
                            if fastq_filename2 is not None:
                                seqrec = self.fastq_dict[r1_fusion_id]
                        else:  # both r1, r2 are not empty
                            r1_fusion_id = get_fusion_id(r1s[0].seqid)
                            r2_fusion_id = get_fusion_id(r2s[0].seqid)
                            r1_len = sum(x.end - x.start for x in r1s)
                            r2_len = sum(x.end - x.start for x in r2s)
                            if r1_len > r2_len:
                                recs = r1s
                                if fastq_filename2 is not None:
                                    seqrec = self.fastq_dict[r1_fusion_id]
                            else:
                                recs = r2s
                                if fastq_filename2 is not None:
                                    seqrec = fastq_dict2[r2_fusion_id]
                            new_group_info[tID] = (
                                self.group_info[r1_fusion_id] + group_info2[r2_fusion_id])
                            f_mgroup.write(f"{tID}\t{r1_fusion_id}\t{r2_fusion_id}\n")
                        if fastq_filename2 is not None:
                            seqrec.id = tID
                            SeqIO.write(seqrec, open(f_fastq, "w"), "fastq")
                        with open(f"{output_prefix}.group.txt", "w") as f_group:
                            f_group.write(f"{tID}\t{','.join(new_group_info[tID])}\n")
                        with open(f"{output_prefix}.gff", "w") as f_out:
                            # now write out the fusion transcript
                            for j, r in enumerate(recs):
                                f_out.write(
                                    f'{r.chr}\tPacBio\ttranscript\t{r.start + 1}\t{r.end}\t.\t{strand}\t.\tgene_id "{tID}"; transcript_id "{tID}.{j + 1}";\n'
                                )
                                for exon in r.ref_exons:
                                    f_out.write(
                                        f'{r.chr}\tPacBio\texon\t{exon.start + 1}\t{exon.end}\t.\t{strand}\t.\tgene_id "{tID}"; transcript_id "{tID}.{j + 1}";\n'
                                    )
    return new_group_info
            else:
                reg_cov[line[2]] = [int(line[3])]
        except:
            continue
    infh.close()
    return reg_cov

if __name__ == '__main__':
    try:
        repeat_file_1 = sys.argv[1]
        alignment_file = sys.argv[2]
    except:
        print 'Provide repeat region file in WIG format, Alignment file in SAM format'
        sys.exit(-1)

    cluster_distance = 1
    repeat_regions_50 = collections.defaultdict(
        lambda: ClusterTree(cluster_distance, 2))
    repeat_generator = repeat_parse(repeat_file_1)
    for match_id, start, end, score in repeat_generator:
        repeat_regions_50[match_id].insert(start, end, score)
    #print 'Number of clusters: ' + str(len(repeat_regions_50))

    location_db = sam_reader(alignment_file)
    repeat_cnt = 0
    for chrom, cluster_tree in repeat_regions_50.items():
        for start, end, scores in cluster_tree.getregions():
            for rloc in location_db[chrom]:
                if (rloc >= start and rloc <= end) or \
                        (rloc + 80 >= start and rloc + 80 <= end):
                    repeat_cnt += 1
    print repeat_cnt
def categorize_aln_by_annotation(gene_annotation_file, input_fasta, input_sam,
                                 output_prefix, min_overlap_bp=200,
                                 min_query_overlap=.5, min_gene_overlap=.8):
    t = defaultdict(lambda: {'+': IntervalTree(), '-': IntervalTree()})  # chr -> strand -> IntervalTree
    info = {}
    #reader = DictReader(open('ProteinTable149_154224.txt'),delimiter='\t')
    for r in DictReader(open(gene_annotation_file), delimiter='\t'):
        if r['#Replicon Name'] != 'chr':
            print("Ignore", r, file=sys.stderr)
            continue
        info[r['Locus tag']] = (int(r['Start']), int(r['Stop']), r['Locus tag'])
        t[r['Replicon Accession']][r['Strand']].add(int(r['Start']), int(r['Stop']),
                                                    r['Locus tag'])
    #pdb.set_trace()

    result = defaultdict(lambda: [])  # gene -> list of rec
    d = dict((r.id, len(r.seq)) for r in SeqIO.parse(open(input_fasta), 'fasta'))
    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        #if r.qID == 'm151125_055539_42275_c100921822550000001823204305121656_s1_p0/121461/30_2108_CCS':
        #    pdb.set_trace()
        ans = match_w_annotation(t, r, info, min_overlap_bp, min_query_overlap,
                                 min_gene_overlap)
        # ans is AMatch(name, strand, start, end, record)
        result[ans.name].append(ans)

    novel_ct = defaultdict(lambda: {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)})
    novel_list = []
    novel_index = 0

    f = open(output_prefix + '.sam', 'w')
    f.write(reader.header)
    f1 = open(output_prefix + '.report.txt', 'w')
    f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")
    for k, v in result.items():
        # v is: list of AMatch(name, strand, start, end, record)
        if k.startswith('novel-unannotated'):
            # write novel later, we are grouping them by loci first
            #tagRG='novel'
            for x in v:
                novel_ct[x.record.sID][x.strand].insert(x.start, x.end, novel_index)
                novel_index += 1
                novel_list.append(x)
            continue
        elif k.startswith('novel-antisense'):
            tagRG = 'novel-antisense'
        elif k.startswith('novel-partial'):
            tagRG = 'novel-partial'
        elif k.startswith('poly-'):
            tagRG = 'poly'
        else:
            tagRG = 'single'
        v.sort(key=lambda x: (x.start, x.end),
               reverse=True if v[0].strand == '-' else False)  # sort by start, then end
        for i, x in enumerate(v):
            f.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
                x.record.record_line, i + 1, tagRG, k))
            if x.strand == '+':
                f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
                    x.record.qID, tagRG, k, i + 1, x.strand, x.start + 1, x.end))
            else:  # - strand, start is end, end is start
                f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
                    x.record.qID, tagRG, k, i + 1, x.strand, x.end, x.start + 1))

    # now write the novel stuff, grouped by regions
    novel_region_index = 1
    for d1 in novel_ct.values():
        for ct in d1.values():
            gn = 'novel-' + str(novel_region_index)
            for _start, _end, _indices in ct.getregions():
                v = [novel_list[ind] for ind in _indices]
                v.sort(key=lambda x: (x.start, x.end),
                       reverse=True if v[0].strand == '-' else False)  # sort by start, then end
                for i, x in enumerate(v):
                    f.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
                        x.record.record_line, i + 1, "novel-unannotated", gn))
                    if x.strand == '+':
                        f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
                            x.record.qID, "novel-unannotated", gn, i + 1,
                            x.strand, x.start + 1, x.end))
                    else:
                        f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(
                            x.record.qID, "novel-unannotated", gn, i + 1,
                            x.strand, x.end, x.start + 1))
            novel_region_index += 1

    f.close()
    f1.close()
    print("Output written to:", f.name, file=sys.stderr)
    print("Output written to:", f1.name, file=sys.stderr)
        'locus_downstream_boundary'] = missing_from_collection.plink_ld_partners.apply(
            find_ld_partner, args=('loci_downstream', ))
results_snps_df.update(missing_from_collection)
results_snps_df['chr'] = results_snps_df['chr'].astype('int')
results_snps_df['pos'] = results_snps_df['pos'].astype('int')
results_snps_df['locus_downstream_boundary'] = results_snps_df[
    'locus_downstream_boundary'].astype('int')
results_snps_df['locus_upstream_boundary'] = results_snps_df[
    'locus_upstream_boundary'].astype('int')

####### Merge associated loci within 250 kb of each other.
trees = {}
min_intervals = 0
for i in range(1, 23, 1):
    trees[i] = ClusterTree(merging_distance_kb, min_intervals)
for i, (index, row) in enumerate(results_snps_df.iterrows()):
    if row.chr in range(1, 23, 1):
        trees[row.chr].insert(row.locus_upstream_boundary,
                              row.locus_downstream_boundary, i)

results_loci_df = pd.DataFrame(columns=[
    'snp_name', 'chr', 'pos', 'pvalue', 'locus_upstream_boundary',
    'locus_downstream_boundary'
])
results_snps_df['locus'] = None
counter = 0
for chrom in trees:
    for (start, end, loci) in trees[chrom].getregions():
if __name__ == '__main__':
    try:
        alignment_file = sys.argv[1]
        cluster_name = sys.argv[2]
        cluster_distance = int(sys.argv[3])
    except:
        print __doc__
        sys.exit(-1)

    # - Distance in basepairs for two reads to be in the same cluster;
    #   for instance 20 would group all reads with 20bp of each other
    # - Number of reads necessary for a group to be considered a cluster;
    #   100 returns all groups with 100 or more overlapping reads
    cluster_trees = collections.defaultdict(
        lambda: ClusterTree(cluster_distance, 100))
    read_id_map, cnt, location_db, freq_db = dict(), 0, dict(), dict()
    align_generator = alignment_parse(alignment_file, cluster_name)
    for read_id, match_id, start, end in align_generator:
        if not read_id in read_id_map:  # make read id compact
            cnt += 1
            read_id_map[read_id] = cnt
        cluster_trees[match_id].insert(start, end, read_id_map[read_id])
        location_db[read_id_map[read_id]] = start, end  # reads location
        if read_id_map[read_id] in freq_db:  # multiple alignments
def tally_for_a_Cogent_dir(dirname, writer1, writer2, genome1, genome2=None,
                           blastn_filename=None):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (if genome2 does not exist, just repeat genome1)
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return

    seq_info = defaultdict(lambda: [])
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.sam')
    reader = BioReaders.GMAPSAMReader(filename, True,
        query_len_dict=dict((r.id, len(r.seq)) for r in SeqIO.parse(
            open(os.path.join(dirname, 'in.trimmed.fa')), 'fasta')))
    for r in reader:
        seq_info[r.qID].append(r)
        contigs_seen.add(r.sID)

    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, 'in.fa')), 'fasta'):
        assert r.id in seq_info

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_sam(
        os.path.join(dirname, 'cogent2.fa'),
        os.path.join(dirname, 'cogent2.fa.' + genome1 + '.sam'))
    if genome2 is not None:
        d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_sam(
            os.path.join(dirname, 'cogent2.fa'),
            os.path.join(dirname, 'cogent2.fa.' + genome2 + '.sam'))

    if blastn_filename is not None:
        qlen_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(
            open(os.path.join(dirname, 'in.trimmed.fa')), 'fasta'))
        best_of = read_blastn(os.path.join(dirname, blastn_filename), qlen_dict)

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    rec1 = {
        'gene_family': dirname,
        'input_size': len(seq_info),
        'num_Cogent_contigs': len(contigs_seen),
        'num_genome_contig': len(contig_genome1),
        'genome_cov': "{0:.2f}".format(cov1),
        'genome_acc': "{0:.2f}".format(acc1),
        'genome_chimeric': has_chimeric1,
        'genome_contigs': ",".join(contig_genome1)
    }
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    if genome2 is not None:
        cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
        rec1['num_genome2_contig'] = len(contig_genome2)
        rec1['genome2_cov'] = "{0:.2f}".format(cov2)
        rec1['genome2_acc'] = "{0:.2f}".format(acc2)
        rec1['genome2_chimeric'] = has_chimeric2
        rec1['genome2_contigs'] = ",".join(contig_genome2)
    # (for blastn, optional) best name with best e-value
    if blastn_filename is not None:
        if len(best_of) == 0:
            rec1['num_blastn'] = 0
            rec1['blastn_best'] = 'NA'
        else:
            stuff = list(best_of.values())  # list of (e-value, name)
            stuff.sort()
            rec1['num_blastn'] = sum(_n != 'NA' for _e, _n in list(best_of.values()))
            rec1['blastn_best'] = '"' + stuff[0][1] + '"'
    writer1.writerow(rec1)

    in_aligned_to_genome1 = os.path.join(dirname, 'in.trimmed.fa.' + genome1 + '.sam')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_sam(
            os.path.join(dirname, 'in.trimmed.fa'), in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.items():
        contigs = [x.sID for x in v]
        acc = sum(x.identity * x.qCoverage for x in v) / sum(x.qCoverage for x in v)
        rec2 = {
            'seqid': seqid,
            'gene_family': dirname,
            'Cogent_contig': ",".join(contigs),
            'Cogent_contig_acc': acc
        }
        if not seqid in d3:
            rec2['scaffold'] = 'NA'
            rec2['num_scaffold'] = 0
            rec2['scaffold_coverage'] = 'NA'
            rec2['scaffold_acc'] = 'NA'
            if blastn_filename is not None:
                rec2['blastn_best'] = 'NA'
        else:
            scaffolds = [x.sID for x in d3[seqid]]
            # calculate cov and acc
            c = ClusterTree(0, 0)
            for x in d3[seqid]:
                qlen = x.qLen
                c.insert(x.qStart, x.qEnd, -1)
            cov = sum(_e - _s for _s, _e, _junk in c.getregions()) * 100. / qlen
            acc = sum(x.identity * x.qCoverage for x in d3[seqid]) * 1. / sum(
                x.qCoverage for x in d3[seqid])
            rec2['scaffold'] = ",".join(scaffolds)
            rec2['num_scaffold'] = len(scaffolds)
            rec2['scaffold_coverage'] = cov
            rec2['scaffold_acc'] = acc
            if blastn_filename is not None:
                rec2['blastn_best'] = best_of[seqid][1]
        writer2.writerow(rec2)
def main():
    args = get_args()
    if args.files:
        files = [f for f in glob.glob(os.path.join(args.bed, '*.bed'))
                 if f in args.files]
        assert len(files) == len(args.files), \
            "You have specified files that are not in {0}".format(args.bed)
    else:
        files = glob.glob(os.path.join(args.bed, '*.bed'))
    #pdb.set_trace()
    for f in files:
        # setup output files
        fname = os.path.splitext(os.path.basename(f))[0]
        print "Processing: {0}".format(fname)
        outfname = "{0}.json".format(fname)
        outfile = open(os.path.join(args.outdir, outfname), 'w')
        # outdata
        outdata = {}
        # setup our cluster tree, a name dict, and a counter
        cluster_trees = defaultdict(lambda: ClusterTree(500, 2))
        name_map = {}
        name_counter = Counter()
        for line in open(f, 'rU'):
            if not line.startswith('track'):
                chromo, start, end, name = line.split("\t")[:4]
                start, end = int(start), int(end)
                # convert the BED name to a dict, indexed by unique db id pkey
                temp_dict = get_dict_from_name(name)
                key = int(temp_dict['probes-id'])
                # create a counter of names, so we can check for dupe hits
                name_counter.update([key])
                name_map[key] = temp_dict
                cluster_trees[chromo].insert(start, end, key)
        duplicate_hits = check_for_dupe_hits(name_counter)
        if duplicate_hits:
            outdata['Duplicates'] = True
        else:
            outdata['Duplicates'] = False
        outdata['Overlaps'] = defaultdict(list)
        for chromo in cluster_trees:
            overlaps = cluster_trees[chromo].getregions()
            if overlaps:
                for span in overlaps:
                    # get distinct list of *loci* (not probes) hit for a given
                    # region.
                    loci = list(set([int(name_map[probe]['probes-locus'])
                                     for probe in span[2]]))
                    if len(span[2]) > 1:
                        outdata['Overlaps'][chromo].append({
                            'start': span[0],
                            'end': span[1],
                            'loci': loci,
                            'probes': span[2]
                        })
        json.dump(outdata, outfile, indent=2)