Example #1
def merge_loci(df):

	# Cluster the loci
	tree = ClusterTree(0,0)
	for i,marker in enumerate(df.index):
		tree.insert(df.ix[marker,'locus_gene_boundaries'][0], df.ix[marker,'locus_gene_boundaries'][1],i)

	# Create new dataframe with overlapping loci
	df_addon = pd.DataFrame()
	rows2drop = []
	for i, (start, end, overlapping_loci) in enumerate(tree.getregions()):
		if len(overlapping_loci) > 1:
			marker = ";".join(df.index[overlapping_loci])
			df_addon.ix[marker,'snp_id'] = ";".join(df.ix[overlapping_loci,'snp_id'])
			df_addon.ix[marker,'locus_start'] = start
			df_addon.ix[marker,'locus_end'] = end
			genes_in_locus_set = set((";".join([ (str(x) if not isinstance(x, float) else '') for x in df.ix[overlapping_loci,'genes_in_locus']])).split(";"))
			genes_in_locus_set.discard('')
			df_addon.ix[marker,'genes_in_locus'] = ";".join(genes_in_locus_set)
			df_addon.ix[marker,'nearest_gene'] = ";".join(df.ix[overlapping_loci,'nearest_gene'])
			df_addon.ix[marker,'chr'] = df.ix[overlapping_loci[0],'chr']
			rows2drop.extend(overlapping_loci)
    
	# Add merged locus and drop overlapping loci
	if not df.empty:
		df = df.drop(df.index[rows2drop])
		df = df.append(df_addon)
	return df
Example #2
def sep_by_clustertree(records):
    tree = ClusterTree(0,0)
    for i,r in enumerate(records): tree.insert(r.sStart, r.sEnd, i)
    result = []
    for s,e,indices in tree.getregions():
        result.append([records[i] for i in indices])
    return result
def get_gtf_metadata(gtf_file, 
                     omit_attrs=None,
                     group_by="gene_id", 
                     feature_type="exon"):
    if omit_attrs is None:
        omit_attrs = []
    # read gtf file and group by gene
    gene_feature_map = collections.defaultdict(lambda: [])
    gene_attrs_set = set()
    for feature in GTFFeature.parse(open(gtf_file)):
        if feature.feature_type != feature_type:
            continue
        feature_id = feature.attrs[group_by]
        gene_feature_map[feature_id].append(feature)
        gene_attrs_set.update(feature.attrs.keys())
    gene_attrs_set.difference_update(omit_attrs)
    gene_attrs_list = sorted(gene_attrs_set)
    metadata_fields = ["tracking_id", "locus", "strand", "num_exons", "transcript_length"] + gene_attrs_list
    metadata_inds = dict((x,i) for i,x in enumerate(metadata_fields))
    metadata_dict = {}
    # output metadata sorted by gene id
    for feature_id,features in gene_feature_map.iteritems():
        # collect attributes for this gene
        attrdict = collections.defaultdict(lambda: set())
        # cluster exons together for each gene
        cluster_tree = ClusterTree(0,1)
        for i,f in enumerate(features):
            cluster_tree.insert(f.start, f.end, i)
            for k,v in f.attrs.iteritems():
                if k in gene_attrs_set:
                    # some attributes have multiple values separated by a comma
                    attrdict[k].update(v.split(','))
        # determine larger exon clusters
        transcript_length = 0
        exon_clusters = []
        for start, end, indexes in cluster_tree.getregions():
            exon_clusters.append((start,end))
            transcript_length += (end - start)
        del cluster_tree
        chrom = features[0].seqid
        locus_start = min(e[0] for e in exon_clusters)
        locus_end = max(e[1] for e in exon_clusters)
        locus_string = "%s:%d-%d" % (chrom, locus_start, locus_end)
        strand = features[0].strand
        num_exons = len(exon_clusters)
        # make metadata row
        metadata = [feature_id, locus_string, strand, num_exons, transcript_length] + ['NA'] * len(gene_attrs_list)
        # get all attributes
        for k,vals in attrdict.iteritems():
            ind = metadata_inds[k]
            metadata[ind] = ','.join(map(str, sorted(vals)))
        metadata_dict[metadata[0]] = metadata
    return metadata_fields, metadata_dict
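The exon-clustering step above (a ClusterTree(0, 1) filled with exon coordinates, then getregions()) is what collapses overlapping exons into non-redundant clusters before transcript_length is summed. A minimal, self-contained sketch of just that step, using made-up coordinates rather than real GTF features:

from bx.intervals.cluster import ClusterTree

# hypothetical exons for one gene: (start, end); the first two overlap
exons = [(100, 200), (150, 250), (400, 500)]

cluster_tree = ClusterTree(0, 1)  # distance 0, at least 1 interval per cluster
for i, (start, end) in enumerate(exons):
    cluster_tree.insert(start, end, i)

transcript_length = 0
exon_clusters = []
for start, end, _indexes in cluster_tree.getregions():
    exon_clusters.append((start, end))
    transcript_length += end - start

print(exon_clusters)      # [(100, 250), (400, 500)]
print(transcript_length)  # 250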
def read_GFF(gff_filename, logf):
    """
    Read a GFF filename and get the gene regions

    :return: dict of (PB.X) --> LocusInfo
    """
    gff_info = {} # loci --> LocusInfo
    tmp = {} # loci PB.X --> list of GFF records for PB.X.Y

    for r in collapseGFFReader(gff_filename):
        m = rex_pbid.match(r.seqid)
        if m is None: raise Exception, "Expected PBID format PB.X.Y but saw {0}".format(r.seqid)
        locus = m.group(1) # ex: PB.1
        if locus not in tmp:
            tmp[locus] = [r]
            gff_info[locus] = LocusInfo(chrom=r.chr, strand=r.strand, regions=None, isoforms=None)
        else:
            if gff_info[locus].chrom!=r.chr:
                logf.write("WARNING: Expected {0} to be on {1} but saw {2}. Could be minimap2 multi-mapping inconsistency for repetitive genes. Check later.\n".format(\
                    r.seqid, gff_info[locus].chrom, r.chr))
            tmp[locus].append(r)


    # now figure out the exonic regions for each gene PB.X
    for locus, records in tmp.iteritems():
        c = ClusterTree(0, 0)
        for r in records:
            for e in r.ref_exons:
                c.insert(e.start-extra_bp_around_junctions, e.end+extra_bp_around_junctions, 1)

        regions = [(a,b) for (a,b,junk) in c.getregions()]
        regions[0] = (regions[0][0]-__padding_before_after__, regions[0][1])
        regions[-1] = (regions[-1][0], regions[-1][1]+__padding_before_after__)
        gff_info[locus] = LocusInfo(chrom=gff_info[locus].chrom,
                                       strand=gff_info[locus].strand,
                                       regions=regions,
                                       isoforms=[r.seqid for r in records])

    return gff_info
def cluster_txs(txs):
    """Uses a ClusterTree to cluster to cluster transcript objects. TODO: use clusterGenes instead"""
    cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    for i, tx in enumerate(txs):
        cluster_trees[tx.chromosome].insert(tx.start, tx.stop, i)
    # convert the clusters to a nested structure of chrom -> cluster_id -> tx objects
    clustered_reads = collections.defaultdict(dict)
    cluster_id = 0
    for chrom, cluster_tree in cluster_trees.iteritems():
        for start, end, interval_indices in cluster_tree.getregions():
            clustered_reads[chrom][cluster_id] = [
                txs[ix] for ix in interval_indices
            ]
            cluster_id += 1
    return clustered_reads
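A small usage sketch of the per-chromosome clustering pattern used by cluster_txs above, with a namedtuple standing in for whatever transcript class the original code expects (the function itself relies on Python 2's iteritems, so the pattern is shown standalone here):

import collections
from bx.intervals.cluster import ClusterTree

Tx = collections.namedtuple("Tx", ["name", "chromosome", "start", "stop"])  # hypothetical tx object

txs = [
    Tx("tx1", "chr1", 100, 500),
    Tx("tx2", "chr1", 450, 900),    # overlaps tx1 -> same cluster
    Tx("tx3", "chr1", 5000, 6000),  # isolated -> its own cluster
    Tx("tx4", "chr2", 100, 200),
]

# one ClusterTree per chromosome, as cluster_txs builds them
cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
for i, tx in enumerate(txs):
    cluster_trees[tx.chromosome].insert(tx.start, tx.stop, i)

for chrom, tree in cluster_trees.items():
    for start, end, indices in tree.getregions():
        print(chrom, (start, end), [txs[i].name for i in indices])
# chr1 (100, 900) ['tx1', 'tx2']
# chr1 (5000, 6000) ['tx3']
# chr2 (100, 200) ['tx4']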
Example #6
def combine_regions(all_regions, required_regions=1):
    """Generate the combination of a set of chrom, start, end regions.

    If required_regions is 1 then this is a union combination. Otherwise
    it is an intersection.
    """
    clusters = collections.defaultdict(
        lambda: ClusterTree(0, required_regions))
    i = 0
    for region_gen in all_regions:
        for chrom, start, end in region_gen:
            clusters[chrom].insert(start, end, i)
            i += 1
    for chrom, cluster in clusters.iteritems():
        for (s, e, _) in cluster.getregions():
            yield chrom, s, e
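The required_regions argument above is simply ClusterTree's minimum-intervals parameter: with 1, every merged region is reported (a union); with a larger value, only regions built from at least that many input intervals are reported, which is how the intersection-like behaviour falls out. A sketch under the assumption that combine_regions above is importable (note it uses Python 2's iteritems), with two hypothetical region lists:

# hypothetical input region sets, e.g. called regions from two samples
regions_a = [("chr1", 100, 200), ("chr1", 500, 600)]
regions_b = [("chr1", 150, 250), ("chr2", 10, 20)]

# union: every cluster is reported (min intervals = 1)
print(list(combine_regions([regions_a, regions_b], required_regions=1)))
# e.g. [('chr1', 100, 250), ('chr1', 500, 600), ('chr2', 10, 20)]

# "intersection": only clusters containing >= 2 input intervals are reported
print(list(combine_regions([regions_a, regions_b], required_regions=2)))
# e.g. [('chr1', 100, 250)]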
Example #7
def _cluster_by(end_iter, attr1, attr2, cluster_distance):
    """Cluster breakends by specified attributes.
    """
    ClusterInfo = namedtuple("ClusterInfo", ["chroms", "clusters", "lookup"])
    chr_clusters = {}
    chroms = []
    brends_by_id = {}
    for brend in end_iter:
        if not chr_clusters.has_key(brend.chrom1):
            chroms.append(brend.chrom1)
            chr_clusters[brend.chrom1] = ClusterTree(cluster_distance, 1)
        brends_by_id[int(brend.name)] = brend
        chr_clusters[brend.chrom1].insert(getattr(brend, attr1),
                                          getattr(brend, attr2),
                                          int(brend.name))
    return ClusterInfo(chroms, chr_clusters, brends_by_id)
def build_cluster_trees(bed_generator, cluster_distance, read_count):
    """
    arguments to ClusterTree are:
    - Distance in basepairs for two reads to be in the same cluster;
      for instance 20 would group all reads within 20bp of each other
    - Number of reads necessary for a group to be considered a cluster;
      2 returns all groups with 2 or more overlapping reads
    """
    if options.verbose:
        syserr("Making ClusterTree\n")
    cluster_trees = collections.defaultdict(lambda: ClusterTree(cluster_distance, read_count))
    i = 0
    read_ids_mapping = {}
    for read_id, match_id, strand, start, end in bed_generator:
        cluster_trees["%s:%s" % (match_id, strand)].insert(start, end, i)
        read_ids_mapping[i] = read_id
        i += 1
    return cluster_trees, read_ids_mapping
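A standalone sketch of the bookkeeping pattern used by build_cluster_trees above: trees keyed by "match_id:strand", a running integer id per interval, and a side dict (read_ids_mapping) to translate those ids back to read names. (The function itself also touches a module-level options object for verbosity, so the pattern is reproduced directly here with made-up records.)

import collections
from bx.intervals.cluster import ClusterTree

# hypothetical BED-like records: (read_id, match_id, strand, start, end)
bed_records = [
    ("read_a", "chr1", "+", 100, 120),
    ("read_b", "chr1", "+", 110, 130),
    ("read_c", "chr1", "-", 500, 520),
]

cluster_trees = collections.defaultdict(lambda: ClusterTree(20, 2))  # 20 bp distance, >= 2 reads
read_ids_mapping = {}
for i, (read_id, match_id, strand, start, end) in enumerate(bed_records):
    cluster_trees["%s:%s" % (match_id, strand)].insert(start, end, i)
    read_ids_mapping[i] = read_id

for key, tree in cluster_trees.items():
    for start, end, ids in tree.getregions():
        print(key, (start, end), [read_ids_mapping[i] for i in ids])
# chr1:+ (100, 130) ['read_a', 'read_b']   (the lone '-' read never reaches the 2-read minimum)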
Example #9
def build_cluster_trees(reads, cluster_distance=2, min_size=2):
    """Build cluster tree of reads from a dataframe of locations e.g from
        a set of aligned reads from a sam file.
    Args:
        cluster_distance: Distance in basepairs for two reads to be in the same cluster;
       for instance 20 would group all reads with 20bp of each other
        min_size: Number of reads necessary for a group to be considered a cluster;
       2 returns all groups with 2 or more overlapping reads
    Returns:
        dict of ClusterTrees per chromosome
    """

    import collections
    from bx.intervals.cluster import ClusterTree
    cluster_trees = collections.defaultdict(lambda:
            ClusterTree(cluster_distance, min_size))
    for i, row in reads.iterrows():
        chrom = row['name']
        #print chrom, row.read_id, row.start, row.end
        cluster_trees[chrom].insert(row.start, row.end, row.name)
    return dict(cluster_trees)
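A toy usage sketch for the pandas-based build_cluster_trees above; the 'name' column is assumed to hold the chromosome (that is what the function reads), and the default integer index is what ends up stored as the interval id:

import pandas as pd

reads = pd.DataFrame({
    "name":  ["chr1", "chr1", "chr1", "chr2"],
    "start": [100, 101, 300, 100],
    "end":   [150, 160, 350, 150],
})

trees = build_cluster_trees(reads, cluster_distance=2, min_size=2)
for chrom, tree in trees.items():
    for start, end, row_ids in tree.getregions():
        print(chrom, (start, end), row_ids)
# chr1 (100, 160) [0, 1]   (rows 0 and 1 overlap; the other reads never form a 2-read cluster)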
def categorize_aln_by_annotation(
    gene_annotation_file: str,
    input_fasta: str,
    input_sam: str,
    output_prefix: str,
    min_overlap_bp: int = 200,
    min_query_overlap: float = 0.5,
    min_gene_overlap: float = 0.8,
) -> None:

    t = defaultdict(
        lambda: {"+": IntervalTree(), "-": IntervalTree()}
    )  # chr -> strand -> IntervalTree
    info = {}

    # reader = DictReader(open('ProteinTable149_154224.txt'),delimiter='\t')
    for r in DictReader(open(gene_annotation_file), delimiter="\t"):
        if r["#Replicon Name"] != "chr":
            logger.info(f"Ignore {r}")
            continue
        info[r["Locus tag"]] = (int(r["Start"]), int(r["Stop"]), r["Locus tag"])
        t[r["Replicon Accession"]][r["Strand"]].add(
            int(r["Start"]), int(r["Stop"]), r["Locus tag"]
        )

    # pdb.set_trace()

    result = defaultdict(lambda: [])  # gene -> list of rec
    d = {r.id: len(r.seq) for r in SeqIO.parse(open(input_fasta), "fasta")}

    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        # if r.qID == 'm151125_055539_42275_c100921822550000001823204305121656_s1_p0/121461/30_2108_CCS':
        #    pdb.set_trace()
        ans = match_w_annotation(
            t, r, info, min_overlap_bp, min_query_overlap, min_gene_overlap
        )
        # ans is AMatch(name, strand, start, end, record)
        result[ans.name].append(ans)

    novel_ct = defaultdict(lambda: {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)})
    novel_list = []
    novel_index = 0

    with open(f"{output_prefix}.sam", "w") as f, open(
        f"{output_prefix}.report.txt", "w"
    ) as f1:
        f.write(reader.header)
        f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")
        for k, v in result.items():
            # v is: list of AMatch(name, strand, start, end, record)
            if k.startswith("novel-unannotated"):
                # write novel later, we are grouping them by loci first
                # tagRG='novel'
                for x in v:
                    novel_ct[x.record.sID][x.strand].insert(x.start, x.end, novel_index)
                    novel_index += 1
                    novel_list.append(x)
                continue
            elif k.startswith("novel-antisense"):
                tagRG = "novel-antisense"
            elif k.startswith("novel-partial"):
                tagRG = "novel-partial"
            elif k.startswith("poly-"):
                tagRG = "poly"
            else:
                tagRG = "single"
            v.sort(
                key=lambda x: (x.start, x.end),
                reverse=bool(v[0].strand == "-"),
            )  # sort by start, then end
            for i, x in enumerate(v):
                f.write(
                    f"{x.record.record_line}\tSN:Z:{i + 1:06d}\tRG:Z:{tagRG}\tgn:Z:{k}\n"
                )
                if x.strand == "+":
                    f1.write(
                        f"{x.record.qID}\t{tagRG}\t{k}\t{i + 1:06d}\t{x.strand}\t{x.start + 1}\t{x.end}\n"
                    )
                else:  # - strand, start is end, end is start
                    f1.write(
                        f"{x.record.qID}\t{tagRG}\t{k}\t{i + 1:06d}\t{x.strand}\t{x.end}\t{x.start + 1}\n"
                    )

        # now write the novel stuff, grouped by regions
        novel_region_index = 1
        for d1 in novel_ct.values():
            for ct in d1.values():
                gn = f"novel-{str(novel_region_index)}"
                for *_, _indices in ct.getregions():
                    v = [novel_list[ind] for ind in _indices]
                    v.sort(
                        key=lambda x: (x.start, x.end),
                        reverse=bool(v[0].strand == "-"),
                    )  # sort by start, then end
                    for i, x in enumerate(v):
                        f.write(
                            f"{x.record.record_line}\tSN:Z:{i + 1:06d}\tRG:Z:{'novel-unannotated'}\tgn:Z:{gn}\n"
                        )
                        if x.strand == "+":
                            f1.write(
                                f"{x.record.qID}\t{'novel-unannotated'}\t{gn}\t{i + 1:06d}\t{x.strand}\t{x.start + 1}\t{x.end}\n"
                            )
                        else:
                            f1.write(
                                f"{x.record.qID}\t{'novel-unannotated'}\t{gn}\t{i + 1:06d}\t{x.strand}\t{x.end}\t{x.start + 1}\n"
                            )
                    novel_region_index += 1

        logger.info(f"Output written to: {f.name}")
        logger.info(f"Output written to: {f1.name}")
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group,
                     addon_name, fuzzy_junction, allow_5merge, max_3_diff,
                     n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)
    #print("# of recs: {0}, cpus: {1}, chunk_size: {2}".format(n, n_chunks, chunk_size))

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(
                        recs[cur].seqid,
                        ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
			# note: because the records are organized by (chrom, strand),
			# we may not end up using all the chunks; e.g., if all records are on the same locus, everything gets written to one split file
            if counter >= (i + 1) * chunk_size:
                i += 1
                f_gff.close()
                f_group.close()
                split_files.append((f_gff.name, f_group.name))
                if i >= n_chunks or counter >= len(recs):
                    break
                f_gff = open(addon_gff + '.split' + str(i), 'w')
                f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction,
                          allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    #print("split files: {0}, result_prefix: {1}".format(split_files, result_prefixes))
    return result_prefixes, split_files
Example #12
                     exonNum+1,
                     gene.id,
                     gene.id,
                     trans_id
                 ))



################################################################################
if __name__ == '__main__':
    # build clusters
    clusterReads.c = 0
    readDict = { }
    clusterDist = 50
    clusterMembers = 1
    cluster_treesP = collections.defaultdict(lambda:ClusterTree(clusterDist, 
                                                                clusterMembers))
    cluster_treesN = collections.defaultdict(lambda:ClusterTree(clusterDist, 
                                                                clusterMembers))
    clusterReads(args.bamfile, cluster_treesP, cluster_treesN, readDict)
    keys = list(set.union(*[set(cluster_treesN.keys()), 
                            set(cluster_treesP.keys())]))
    
    # Transcript assembly
    geneIsos = collections.defaultdict(list)
    geneReads = collections.defaultdict(list)
    novelClustersN = collections.defaultdict(list)
    novelClustersP = collections.defaultdict(list)
    allIsos = []
    allClusters = [ ]
    allReads = 0
    isosP = collections.defaultdict(list)
Example #13
def make_fake_genome(genome_filename,
                     gff_filename,
                     ref_chr,
                     ref_start,
                     ref_end,
                     ref_strand,
                     output_prefix,
                     output_name,
                     genome_d=None):
    if genome_d is None:
        print >> sys.stderr, "Reading genome file {0}...".format(
            genome_filename)
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), 'fasta'))
    else:
        d = genome_d

    print >> sys.stderr, "Reading GFF file {0}...".format(gff_filename)
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if r.chr==ref_chr and r.strand==ref_strand and \
                (ref_start <= r.start < r.end <= ref_end) \
            and len(r.ref_exons) > 1:
            print >> sys.stderr, "Adding {0} to fake genome.".format(r.seqid)
            good.append(r)

    if len(good) == 0:
        print >> sys.stderr, "Did not find any transcripts strictly within {0}:{1}-{2} on strand {3}. Abort!".format(\
            ref_chr, ref_start, ref_end, ref_strand)
        sys.exit(-1)

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(e.start - extra_bp_around_junctions,
                     e.end + extra_bp_around_junctions, 1)

    regions = [(a, b) for (a, b, junk) in c.getregions()]
    regions[0] = (regions[0][0] - __padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1] + __padding_before_after__)

    with open(output_prefix + '.fasta', 'w') as f:
        f.write(">" + output_name + "\n")
        for a, b in regions:
            f.write(str(d[r.chr][a:b].seq))
        f.write("\n")
        f.close()

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix + '.mapping.txt', 'w') as f:
        i = 0
        for a, b in regions:
            for j in xrange(a, b):
                f.write("{0},{1},{2}\n".format(i, ref_chr, j))
                i += 1

        with open(output_prefix + '.pbids.txt', 'w') as f:
            f.write("\n".join(r.seqid for r in good) + '\n')

    print >> sys.stderr, "Output written to {0}.fasta, {0}.mapping.txt, {0}.pbids.txt.".format(
        output_prefix)
def loc2region(li):
    clu = ClusterTree(0, 0)
    for x in li:
        clu.insert(x, x + 1, 0)
    for start, end, _ in clu.getregions():
        yield (start, end)
Example #15
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2, blastn_filename=None):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (if genome2 does not exist, just repeat genome1)
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return
    seq_info = defaultdict(lambda: [])
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.sam')
    reader = BioReaders.GMAPSAMReader(filename, True, \
                                      query_len_dict=dict((r.id, len(r.seq)) for r in SeqIO.parse(open(os.path.join(dirname, 'in.trimmed.fa')), 'fasta')))
    for r in reader:
        seq_info[r.qID].append(r)
        contigs_seen.add(r.sID)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, 'in.fa')), 'fasta'):
        assert r.id in seq_info

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_sam(os.path.join(dirname, 'cogent2.fa'), os.path.join(dirname,'cogent2.fa.'+genome1+'.sam'))
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_sam(os.path.join(dirname, 'cogent2.fa'), os.path.join(dirname,'cogent2.fa.'+genome2+'.sam'))

    if blastn_filename is not None:
        qlen_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(open(os.path.join(dirname, 'in.trimmed.fa')),'fasta'))
        best_of = read_blastn(os.path.join(dirname, blastn_filename), qlen_dict)

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info), len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(len(contig_genome1), cov1, acc1, has_chimeric1, ",".join(contig_genome1)))
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}".format(len(contig_genome2), cov2, acc2, has_chimeric2, ",".join(contig_genome2)))
    # (for blastn, optional) best name with best e-value
    if blastn_filename is not None:
        if len(best_of) == 0: f1.write("\t0\tNA\n")
        else:
            stuff = best_of.values() # list of (e-value, name)
            stuff.sort()
            f1.write("\t{0}\t\"{1}\"\n".format(sum(_n!='NA' for _e,_n in best_of.values()), stuff[0][1]))
    else: f1.write("\n")

    in_aligned_to_genome1 = os.path.join(dirname, 'in.trimmed.fa.'+genome1+'.sam')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_sam(os.path.join(dirname, 'in.trimmed.fa'), in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.iteritems():
        contigs = [x.sID for x in v]
        acc = sum(x.identity*x.qCoverage for x in v)/sum(x.qCoverage for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname, ",".join(contigs), acc))

        if not seqid in d3:
            f2.write("NA\t0\tNA\tNA")
            if blastn_filename is not None: f2.write("\tNA\n")
            else: f2.write("\n")
        else:
            scaffolds = [x.sID for x in d3[seqid]]
            # calculate cov and acc
            c = ClusterTree(0,0)
            for x in d3[seqid]:
                qlen = x.qLen
                c.insert(x.qStart, x.qEnd, -1)
            cov = sum(_e-_s for _s,_e,_junk in c.getregions())*100./qlen
            acc = sum(x.identity*x.qCoverage for x in d3[seqid])*1./sum(x.qCoverage for x in d3[seqid])
            f2.write("{0}\t{1}\t{2}\t{3}".format(",".join(scaffolds), len(scaffolds), cov, acc))
            if blastn_filename is not None: f2.write("\t{0}\n".format(best_of[seqid][1]))
            else: f2.write("\n")
Example #16
def chain_split_file(
    ref_gff: Path,
    ref_group: Path,
    ref_name: str,
    addon_gff: Path,
    addon_group: Path,
    addon_name: str,
    fuzzy_junction: int,
    allow_5merge: bool,
    max_3_diff: int,
    n_chunks: int,
) -> Tuple[List[str], List[str]]:
    """
    Organize entries in both a gff and transcript group file
    and split both such that the original two files are split into chunks
    where gff.chunk.n covers the same entries as group.chunk.n
    """

    # read in the group_file as a dictionary in the form of
    # {
    #   'PB.1.1': ["transcript/1"],
    #   'PB.1.2': ["transcript/2", "transcript/3"]
    # }
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    # with addon_group.open('r') as ag:
    #     addon_group_info = {_.split('\t')[0]: _.split('\t')[1].split(",") for _ in ag.readlines()}
    recs = []
    tree = OrderedDict()
    i = 0
    # for r in HTSeq.GFF_Reader(addon_gff):
    # if r.iv.chrom not in tree2:
    #     tree[r.iv.chrom] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
    #     tree[r.iv.chrom][r.iv.strand].insert(r.iv.start, r.iv.end, i)
    #     recs.append(r)
    #     i += 1

    # This should build a structure in the form of:
    # {"chrN":
    #   {
    #       "+" : bx.intervals.cluster.clusterTree,
    #       "-" : bx.intervals.cluster.clusterTree,
    #   },
    # "chrN+1":
    #   {
    #       "+" : bx.intervals.cluster.clusterTree,
    #       "-" : bx.intervals.cluster.clusterTree,
    #   },
    # }
    # ClusterTree objects have the form
    #   [(x,y,[z]), (a,b,[c]), (m,n,[o])]
    #   where each tuple is a range and a list of ids that lie within that range
    # e.g. (from the bx-python docs): build tree = ClusterTree(0, 0), then insert
    #   (6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4);
    #   tree.getregions() returns [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]
    # (a runnable version of this example appears right after this function)

    # NOTE: GFF.collapseGFFReader is a specialized GFF reader that in the attributes
    # field stores a list of bx.intervals.intersection.Interval objects
    # describing the exons
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {"+": ClusterTree(0, 0), "-": ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)

    split_files = []
    i = 0
    counter = 0
    f_gff = open(f"{addon_gff}.split{str(i)}", "w")
    f_group = open(f"{addon_group}.split{str(i)}", "w")
    # this loop is going to reorder everything
    # so that we have a GFF with a transcript followed by all the exons that
    # made up that transcript and a separate file with the matching
    # transcript_id transcript/read_group#
    # (see the sp.MegaPBTree above)
    for v1 in tree.values():
        for strand in ("+", "-"):
            v2 = v1[strand]
            for *_, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write(
                        f"{recs[cur].seqid}\t{','.join(addon_group_info[recs[cur].seqid])}\n"
                    )
                    counter += 1
            if counter >= (i + 1) * chunk_size:
                i += 1
                n = f_gff.tell()
                f_gff.close()
                f_group.close()
                if n == 0:  # didn't write any records, delete these
                    Path(f_gff.name).unlink()
                    Path(f_group.name).unlink()
                else:
                    split_files.append((f_gff.name, f_group.name))
                if i >= n_chunks or counter >= len(recs):
                    break
                f_gff = open(f"{addon_gff}.split{str(i)}", "w")
                f_group = open(f"{addon_group}.split{str(i)}", "w")
    if not f_gff.closed:
        n = f_gff.tell()
        f_gff.close()
        f_group.close()
        if n == 0:  # didn't write any records, delete these
            Path(f_gff.name).unlink()
            Path(f_group.name).unlink()
        else:
            split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(
            target=chain_helper,
            args=(
                ref_gff,
                ref_group,
                split_gff,
                split_group,
                ref_name,
                f"{addon_name}.{str(i)}",
                fuzzy_junction,
                allow_5merge,
                max_3_diff,
            ),
        )
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, f"{addon_name}.{str(i)}"))
    for p in pools:
        p.join()
    return result_prefixes, split_files
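A runnable version of the ClusterTree example quoted in the comments above (the values come from the bx-python docs), just to make the shape of getregions() output concrete:

from bx.intervals.cluster import ClusterTree

tree = ClusterTree(0, 0)
for start, end, _id in [(6, 7, 1), (1, 2, 3), (9, 10, 2), (3, 4, 0), (3, 8, 4)]:
    tree.insert(start, end, _id)

print(tree.getregions())
# [(1, 2, [3]), (3, 8, [0, 1, 4]), (9, 10, [2])]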
Example #17
def total_coverage(tmprecs):
    tree = ClusterTree(0, 0)
    for r in tmprecs:
        tree.insert(r.qStart, r.qEnd, -1)
    return sum(reg[1] - reg[0] for reg in tree.getregions())
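A tiny sketch of total_coverage above, with hypothetical records that only carry the qStart/qEnd fields it reads (and assuming ClusterTree is imported as in the examples here); the tree merges overlapping query spans so double-covered bases are counted once:

from collections import namedtuple

# hypothetical alignment records; only qStart/qEnd are used by total_coverage
Rec = namedtuple("Rec", ["qStart", "qEnd"])
recs = [Rec(0, 100), Rec(50, 150), Rec(300, 400)]

print(total_coverage(recs))
# 250  -> spans (0, 150) and (300, 400); the 50 bp overlap is counted once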
def write_reclist_to_gff_n_info(
    rec_list: Dict[str, Any],
    final_prefix: str,
    ref_name: str,
    addon_name: str,
    use_fq: bool = False,
) -> Dict[str, str]:
    # now go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {
        "+": ClusterTree(0, 0),
        "-": ClusterTree(0, 0)
    })
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(
            match_rec.rec.start, match_rec.rec.end, i)

    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(tree_keys_numeric) + sorted(tree_keys_alpha)

    writer_info = DictWriter(
        Path(f"{final_prefix}.mega_info.txt").open("w"),
        fieldnames=["superPBID", ref_name, addon_name],
        delimiter="\t",
    )
    writer_info.writeheader()
    if use_fq:
        f_fq = Path(f"{final_prefix}.rep.fq")
    with open(f"{final_prefix}.gff",
              "w") as f_gff, open(f"{final_prefix}.group.txt", "w") as f_group:
        new_group_info = {}

        pb_i = 0
        for _chr in tree_keys:
            for _strand in ("+", "-"):
                for *_, _indices in tree[_chr][_strand].getregions():
                    # further sort these records by (start, end, num_exons)
                    _indices.sort(key=lambda i: (
                        rec_list[i].rec.start,
                        rec_list[i].rec.end,
                        len(rec_list[i].rec.ref_exons),
                    ))
                    pb_i += 1
                    for pb_j, recs_index in enumerate(_indices):
                        pbid = f"PB.{pb_i}.{pb_j + 1}"
                        match_rec = rec_list[recs_index]
                        new_group_info[pbid] = match_rec.members
                        match_rec.rec.seqid = pbid
                        GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                        writer_info.writerow({
                            "superPBID": pbid,
                            ref_name: match_rec.ref_id,
                            addon_name: match_rec.addon_id,
                        })
                        f_group.write(
                            f"{pbid}\t{','.join(match_rec.members)}\n")
                        if use_fq:
                            match_rec.seqrec.id = pbid
                            match_rec.seqrec.description = ""
                            SeqIO.write(match_rec.seqrec, f_fq, "fastq")

    return new_group_info
def make_fake_genome(
    genome_filename,
    gff_filename,
    ref_chr,
    ref_start,
    ref_end,
    ref_strand,
    output_prefix,
    output_name=None,
    genome_d=None,
):
    if genome_d is None:
        logger.info(f"Reading genome file {genome_filename}...")
        d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), "fasta"))
    else:
        d = genome_d

    if output_name is None:
        output_name = f"fake_{genome_filename}"

    logger.info(f"Reading GFF file {gff_filename}...")
    good = []
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        if (r.chr == ref_chr and r.strand == ref_strand
                and (ref_start <= r.start < r.end <= ref_end)
                and len(r.ref_exons) > 1):
            logger.info(f"Adding {r.seqid} to fake genome.")
            good.append(r)

    if len(good) == 0:
        raise RuntimeError(
            f"Did not find any transcripts strictly within {ref_chr}:{ref_start}-{ref_end} on strand {ref_strand}. Abort!"
        )

    c = ClusterTree(0, 0)
    for r in good:
        for e in r.ref_exons:
            c.insert(
                e.start - extra_bp_around_junctions,
                e.end + extra_bp_around_junctions,
                1,
            )

    regions = [(a, b) for (a, b, junk) in c.getregions()]
    regions[0] = (regions[0][0] - __padding_before_after__, regions[0][1])
    regions[-1] = (regions[-1][0], regions[-1][1] + __padding_before_after__)

    with open(output_prefix + ".fasta", "w") as f:
        f.write(">" + output_name + "\n")
        for a, b in regions:
            f.write(str(d[r.chr][a:b].seq))
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(output_prefix + ".mapping.txt", "w") as f:
        i = 0
        for a, b in regions:
            for j in range(a, b):
                f.write(f"{i},{ref_chr},{j}\n")
                i += 1

        with open(output_prefix + ".pbids.txt", "w") as f:
            f.write("\n".join(r.seqid for r in good) + "\n")

    logger.info(
        f"Output written to {output_prefix}.fasta, {output_prefix}.mapping.txt, {output_prefix}.pbids.txt."
    )
def write_reclist_to_gff_n_info(rec_list,
                                final_prefix,
                                ref_name,
                                addon_name,
                                use_fq=False):
    # now go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {
        '+': ClusterTree(0, 0),
        '-': ClusterTree(0, 0)
    })
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i, match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(
            match_rec.rec.start, match_rec.rec.end, i)

    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(list(tree_keys_numeric)) + sorted(list(tree_keys_alpha))

    f_gff = open(final_prefix + '.gff', 'w')
    f_info = open(final_prefix + '.mega_info.txt', 'w')
    writer_info = DictWriter(f_info,
                             fieldnames=['superPBID', ref_name, addon_name],
                             delimiter='\t')
    writer_info.writeheader()
    f_group = open(final_prefix + '.group.txt', 'w')
    if use_fq:
        f_fq = open(final_prefix + '.rep.fq', 'w')
    # sort the combined gff (tree) by chromosome and strand (- first)

    new_group_info = {}

    pb_i = 0

    for _chr in tree_keys:
        # remember to convert potential integer chromosome keys back to string now that we sorted them!
        _chr = str(_chr)
        for _strand in ('+', '-'):
            for _start, _end, _indices in tree[_chr][_strand].getregions():
                # further sort these records by (start, end, num_exons)
                _indices.sort(key=lambda i: (rec_list[i].rec.start, rec_list[
                    i].rec.end, len(rec_list[i].rec.ref_exons)))
                pb_i += 1
                for pb_j, recs_index in enumerate(_indices):
                    pbid = "PB.{0}.{1}".format(pb_i, pb_j + 1)
                    match_rec = rec_list[recs_index]
                    new_group_info[pbid] = match_rec.members
                    match_rec.rec.seqid = pbid
                    GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                    writer_info.writerow({
                        'superPBID': pbid,
                        ref_name: match_rec.ref_id,
                        addon_name: match_rec.addon_id
                    })
                    f_group.write("{0}\t{1}\n".format(
                        pbid, ",".join(match_rec.members)))
                    if use_fq:
                        match_rec.seqrec.id = pbid
                        match_rec.seqrec.description = ''
                        SeqIO.write(match_rec.seqrec, f_fq, 'fastq')
    f_gff.close()
    f_info.close()
    f_group.close()
    if use_fq:
        f_fq.close()
    return new_group_info
                    fout.write(
                        '%s\tprotein_coding\tCDS\t%d\t%d\t.\t%s\t.\t gene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; transcript_name "%s"; protein_id "%s";\n'
                        % (gene.chromosome, exon[0], exon[1], gene.strand,
                           gene.id, trans_id, exonNum + 1, gene.id, gene.id,
                           trans_id))


################################################################################
if __name__ == '__main__':
    # build clusters
    clusterReads.c = 0
    readDict = {}
    clusterDist = 50
    clusterMembers = 1
    cluster_treesP = collections.defaultdict(
        lambda: ClusterTree(clusterDist, clusterMembers))
    cluster_treesN = collections.defaultdict(
        lambda: ClusterTree(clusterDist, clusterMembers))
    clusterReads(args.bamfile, cluster_treesP, cluster_treesN, readDict)
    keys = list(
        set.union(*[set(cluster_treesN.keys()),
                    set(cluster_treesP.keys())]))

    # Transcript assembly
    geneIsos = collections.defaultdict(list)
    geneReads = collections.defaultdict(list)
    novelClustersN = collections.defaultdict(list)
    novelClustersP = collections.defaultdict(list)
    allIsos = []
    allClusters = []
    allReads = 0
    def write_cluster_tree_as_gff(
        self,
        cluster_tree: ClusterTree,
        rec_list: List[GFF.gmapRecord],
        group_filename2: Union[str, Path],
        sample_prefix2: str,
        output_prefix: str,
        fastq_filename2: Optional[Union[str, Path]] = None,
    ) -> Dict[str, str]:
        """
        Write ClusterTree (chr --> dict --> (start, end, rec_list_index)) as collapsedGFF format
        Returns --- a new group_info!!!
        """
        if fastq_filename2 is not None:
            fastq_dict2 = MegaPBTree.read_fastq_to_dict(fastq_filename2)
            f_fastq = Path(f"{output_prefix}.rep.fq")
        group_info2 = MegaPBTree.read_group(group_filename2, sample_prefix2)
        new_group_info = {}

        with open(f"{output_prefix}.mega_info.txt", "w") as f_mgroup:
            f_mgroup.write(f"pbid\t{self.self_prefix}\t{sample_prefix2}\n")
            fusion_index = 0
            chroms = list(cluster_tree.keys())
            chroms.sort()

            # IMPORTANT: for fusion, this is *just* the chrom of the first record! Fusions can be multi-chrom
            for k in chroms:
                for strand in ("+", "-"):
                    for *_, rec_indices in cluster_tree[k][strand].getregions(
                    ):
                        for i in rec_indices:
                            fusion_index += 1
                            tID = f"PBfusion.{fusion_index}"
                            r1s, r2s = rec_list[i]
                            if r1s is None:  # r2s is not None
                                recs = r2s
                                r2_fusion_id = get_fusion_id(r2s[0].seqid)
                                new_group_info[tID] = group_info2[r2_fusion_id]
                                f_mgroup.write(f"{tID}\tNA\t{r2_fusion_id}\n")
                                if fastq_filename2 is not None:
                                    seqrec = fastq_dict2[r2_fusion_id]
                            elif r2s is None:  # r1 is not None
                                recs = r1s
                                r1_fusion_id = get_fusion_id(r1s[0].seqid)
                                new_group_info[tID] = self.group_info[
                                    r1_fusion_id]
                                f_mgroup.write(f"{tID}\t{r1_fusion_id}\tNA\n")
                                if fastq_filename2 is not None:
                                    seqrec = self.fastq_dict[r1_fusion_id]
                            else:  # both r1, r2 are not empty
                                r1_fusion_id = get_fusion_id(r1s[0].seqid)
                                r2_fusion_id = get_fusion_id(r2s[0].seqid)
                                r1_len = sum(x.end - x.start for x in r1s)
                                r2_len = sum(x.end - x.start for x in r2s)
                                if r1_len > r2_len:
                                    recs = r1s
                                    if fastq_filename2 is not None:
                                        seqrec = self.fastq_dict[r1_fusion_id]
                                else:
                                    recs = r2s
                                    if fastq_filename2 is not None:
                                        seqrec = fastq_dict2[r2_fusion_id]
                                new_group_info[tID] = (
                                    self.group_info[r1_fusion_id] +
                                    group_info2[r2_fusion_id])
                                f_mgroup.write(
                                    f"{tID}\t{r1_fusion_id}\t{r2_fusion_id}\n")

                            if fastq_filename2 is not None:
                                seqrec.id = tID
                                SeqIO.write(seqrec, open(f_fastq, "w"),
                                            "fastq")

                            with open(f"{output_prefix}.group.txt",
                                      "w") as f_group:
                                f_group.write(
                                    f"{tID}\t{','.join(new_group_info[tID])}\n"
                                )

                            with open(f"{output_prefix}.gff", "w") as f_out:
                                # now write out the fusion transcript
                                for j, r in enumerate(recs):
                                    f_out.write(
                                        f'{r.chr}\tPacBio\ttranscript\t{r.start + 1}\t{r.end}\t.\t{strand}\t.\tgene_id "{tID}"; transcript_id "{tID}.{j + 1}";\n'
                                    )
                                    for exon in r.ref_exons:
                                        f_out.write(
                                            f'{r.chr}\tPacBio\texon\t{exon.start + 1}\t{exon.end}\t.\t{strand}\t.\tgene_id "{tID}"; transcript_id "{tID}.{j + 1}";\n'
                                        )
        return new_group_info
Example #23
            else:
                reg_cov[line[2]] = [int(line[3])]
        except:
            continue
    infh.close()
    return reg_cov

if __name__ == '__main__':
    
    try:
        repeat_file_1 = sys.argv[1]
        alignment_file = sys.argv[2]
    except:
        print 'Provide repeat region file in WIG format, Alignment file in SAM format'
        sys.exit(-1)

    cluster_distance = 1 
    repeat_regions_50 = collections.defaultdict(lambda:ClusterTree(cluster_distance, 2))

    repeat_generator = repeat_parse(repeat_file_1)
    for match_id, start, end, score in repeat_generator:
        repeat_regions_50[match_id].insert(start, end, score)
    #print 'Number of clusters: ' + str(len(repeat_regions_50))
    location_db = sam_reader(alignment_file)
    repeat_cnt = 0
    for chrom, cluster_tree in repeat_regions_50.items():
        for start, end, scores in cluster_tree.getregions():
            for rloc in location_db[chrom]:
                if (rloc >= start and rloc <= end) or (rloc+80 >= start and rloc+80 <= end):repeat_cnt += 1
    print repeat_cnt 
def categorize_aln_by_annotation(gene_annotation_file,
                                 input_fasta,
                                 input_sam,
                                 output_prefix,
                                 min_overlap_bp=200,
                                 min_query_overlap=.5,
                                 min_gene_overlap=.8):

    t = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr -> strand -> IntervalTree
    info = {}

    #reader = DictReader(open('ProteinTable149_154224.txt'),delimiter='\t')
    for r in DictReader(open(gene_annotation_file), delimiter='\t'):
        if r['#Replicon Name'] != 'chr':
            print("Ignore", r, file=sys.stderr)
            continue
        info[r['Locus tag']] = (int(r['Start']), int(r['Stop']),
                                r['Locus tag'])
        t[r['Replicon Accession']][r['Strand']].add(int(r['Start']),
                                                    int(r['Stop']),
                                                    r['Locus tag'])

    #pdb.set_trace()

    result = defaultdict(lambda: [])  # gene -> list of rec
    d = dict(
        (r.id, len(r.seq)) for r in SeqIO.parse(open(input_fasta), 'fasta'))

    reader = BioReaders.GMAPSAMReader(input_sam, True, query_len_dict=d)
    for r in reader:
        #if r.qID == 'm151125_055539_42275_c100921822550000001823204305121656_s1_p0/121461/30_2108_CCS':
        #    pdb.set_trace()
        ans = match_w_annotation(t, r, info, min_overlap_bp, min_query_overlap,
                                 min_gene_overlap)
        # ans is AMatch(name, strand, start, end, record)
        result[ans.name].append(ans)

    novel_ct = defaultdict(lambda: {
        '+': ClusterTree(0, 0),
        '-': ClusterTree(0, 0)
    })
    novel_list = []
    novel_index = 0

    f = open(output_prefix + '.sam', 'w')
    f.write(reader.header)
    f1 = open(output_prefix + '.report.txt', 'w')
    f1.write("id\tread_group\tgene_name\tserial_number\tstrand\tstart\tend\n")
    for k, v in result.items():
        # v is: list of AMatch(name, strand, start, end, record)
        if k.startswith('novel-unannotated'):
            # write novel later, we are grouping them by loci first
            #tagRG='novel'
            for x in v:
                novel_ct[x.record.sID][x.strand].insert(
                    x.start, x.end, novel_index)
                novel_index += 1
                novel_list.append(x)
            continue
        elif k.startswith('novel-antisense'):
            tagRG = 'novel-antisense'
        elif k.startswith('novel-partial'):
            tagRG = 'novel-partial'
        elif k.startswith('poly-'):
            tagRG = 'poly'
        else:
            tagRG = 'single'
        v.sort(key=lambda x: (x.start, x.end),
               reverse=True
               if v[0].strand == '-' else False)  # sort by start, then end
        for i, x in enumerate(v):
            f.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
                x.record.record_line, i + 1, tagRG, k))
            if x.strand == '+':
                f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(\
                    x.record.qID, tagRG, k, i+1, x.strand, x.start+1, x.end))
            else:  # - strand, start is end, end is start
                f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(\
                    x.record.qID, tagRG, k, i+1, x.strand, x.end, x.start+1))

    # now write the novel stuff, grouped by regions
    novel_region_index = 1
    for d1 in novel_ct.values():
        for ct in d1.values():
            gn = 'novel-' + str(novel_region_index)
            for _start, _end, _indices in ct.getregions():
                v = [novel_list[ind] for ind in _indices]
                v.sort(key=lambda x: (x.start, x.end),
                       reverse=True if v[0].strand == '-' else
                       False)  # sort by start, then end
                for i, x in enumerate(v):
                    f.write("{0}\tSN:Z:{1:06d}\tRG:Z:{2}\tgn:Z:{3}\n".format(
                        x.record.record_line, i + 1, "novel-unannotated", gn))
                    if x.strand == '+':
                        f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(\
                            x.record.qID, "novel-unannotated", gn, i+1, x.strand, x.start+1, x.end))
                    else:
                        f1.write("{0}\t{1}\t{2}\t{3:06d}\t{4}\t{5}\t{6}\n".format(\
                            x.record.qID, "novel-unannotated", gn, i+1, x.strand, x.end, x.start+1))
                novel_region_index += 1

    f.close()
    f1.close()

    print("Output written to:", f.name, file=sys.stderr)
    print("Output written to:", f1.name, file=sys.stderr)
    'locus_downstream_boundary'] = missing_from_collection.plink_ld_partners.apply(
        find_ld_partner, args=('loci_downstream', ))
results_snps_df.update(missing_from_collection)
results_snps_df['chr'] = results_snps_df['chr'].astype('int')
results_snps_df['pos'] = results_snps_df['pos'].astype('int')
results_snps_df['locus_downstream_boundary'] = results_snps_df[
    'locus_downstream_boundary'].astype('int')
results_snps_df['locus_upstream_boundary'] = results_snps_df[
    'locus_upstream_boundary'].astype('int')

####### Merge associated loci within 250 kb of each other.

trees = {}
min_intervals = 0
for i in range(1, 23, 1):
    trees[i] = ClusterTree(merging_distance_kb, min_intervals)

for i, (index, row) in enumerate(results_snps_df.iterrows()):
    if row.chr in range(1, 23, 1):
        trees[row.chr].insert(row.locus_upstream_boundary,
                              row.locus_downstream_boundary, i)

results_loci_df = pd.DataFrame(columns=[
    'snp_name', 'chr', 'pos', 'pvalue', 'locus_upstream_boundary',
    'locus_downstream_boundary'
])
results_snps_df['locus'] = None
counter = 0
for chrom in trees:
    for (start, end, loci) in trees[chrom].getregions():
Example #26
def total_coverage(tmprecs):
    tree = ClusterTree(0, 0)
    for r in tmprecs: tree.insert(r.qStart, r.qEnd, -1)
    return sum(reg[1]-reg[0] for reg in tree.getregions())
Example #27
if __name__ == '__main__':
    
    try:
        alignment_file = sys.argv[1]
        cluster_name = sys.argv[2]
        cluster_distance = int(sys.argv[3])
    except:
        print __doc__
        sys.exit(-1)

    # - Distance in basepairs for two reads to be in the same cluster;
    #   for instance 20 would group all reads within 20bp of each other
    # - Number of reads necessary for a group to be considered a cluster;
    #   100 returns all groups with 100 or more overlapping reads
    cluster_trees = collections.defaultdict(lambda:ClusterTree(cluster_distance, 100))

    read_id_map, cnt, location_db, freq_db = dict(), 0, dict(), dict()
    align_generator = alignment_parse(alignment_file, cluster_name)

    for read_id, match_id, start, end in align_generator:

        if not read_id in read_id_map: #make read id compact 
            cnt +=1 
            read_id_map[read_id] = cnt

        cluster_trees[match_id].insert(start, end, read_id_map[read_id])

        location_db[read_id_map[read_id]] = start, end #reads location 

        if read_id_map[read_id] in freq_db: #multiple alignments 
Example #28
def tally_for_a_Cogent_dir(dirname,
                           writer1,
                           writer2,
                           genome1,
                           genome2=None,
                           blastn_filename=None):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2 (if genome2 does not exist, just repeat genome1)
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return
    seq_info = defaultdict(lambda: [])
    contigs_seen = set()
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.sam')
    reader = BioReaders.GMAPSAMReader(filename, True, \
                                      query_len_dict=dict((r.id, len(r.seq)) for r in SeqIO.parse(open(os.path.join(dirname, 'in.trimmed.fa')), 'fasta')))
    for r in reader:
        seq_info[r.qID].append(r)
        contigs_seen.add(r.sID)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, 'in.fa')), 'fasta'):
        assert r.id in seq_info

    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_sam(
        os.path.join(dirname, 'cogent2.fa'),
        os.path.join(dirname, 'cogent2.fa.' + genome1 + '.sam'))
    if genome2 is not None:
        d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_sam(
            os.path.join(dirname, 'cogent2.fa'),
            os.path.join(dirname, 'cogent2.fa.' + genome2 + '.sam'))

    if blastn_filename is not None:
        qlen_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(
            open(os.path.join(dirname, 'in.trimmed.fa')), 'fasta'))
        best_of = read_blastn(os.path.join(dirname, blastn_filename),
                              qlen_dict)

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    rec1 = {
        'gene_family': dirname,
        'input_size': len(seq_info),
        'num_Cogent_contigs': len(contigs_seen),
        'num_genome_contig': len(contig_genome1),
        'genome_cov': "{0:.2f}".format(cov1),
        'genome_acc': "{0:.2f}".format(acc1),
        'genome_chimeric': has_chimeric1,
        'genome_contigs': ",".join(contig_genome1)
    }

    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    if genome2 is not None:
        cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
        rec1['num_genome2_contig'] = len(contig_genome2)
        rec1['genome2_cov'] = "{0:.2f}".format(cov2)
        rec1['genome2_acc'] = "{0:.2f}".format(acc2)
        rec1['genome2_chimeric'] = has_chimeric2
        rec1['genome2_contigs'] = ",".join(contig_genome2)
    # (for blastn, optional) best name with best e-value
    if blastn_filename is not None:
        if len(best_of) == 0:
            rec1['num_blastn'] = 0
            rec1['blastn_best'] = 'NA'
        else:
            stuff = list(best_of.values())  # list of (e-value, name)
            stuff.sort()
            rec1['num_blastn'] = sum(_n != 'NA'
                                     for _e, _n in list(best_of.values()))
            rec1['blastn_best'] = '"' + stuff[0][1] + '"'
    writer1.writerow(rec1)

    in_aligned_to_genome1 = os.path.join(dirname,
                                         'in.trimmed.fa.' + genome1 + '.sam')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_sam(
            os.path.join(dirname, 'in.trimmed.fa'), in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.items():
        contigs = [x.sID for x in v]
        acc = sum(x.identity * x.qCoverage for x in v) / sum(x.qCoverage
                                                             for x in v)

        rec2 = {
            'seqid': seqid,
            'gene_family': dirname,
            'Cogent_contig': ",".join(contigs),
            'Cogent_contig_acc': acc
        }

        if not seqid in d3:
            rec2['scaffold'] = 'NA'
            rec2['num_scaffold'] = 0
            rec2['scaffold_coverage'] = 'NA'
            rec2['scaffold_acc'] = 'NA'
            if blastn_filename is not None:
                rec2['blastn_best'] = 'NA'
        else:
            scaffolds = [x.sID for x in d3[seqid]]
            # calculate cov and acc
            c = ClusterTree(0, 0)
            for x in d3[seqid]:
                qlen = x.qLen
                c.insert(x.qStart, x.qEnd, -1)
            cov = sum(_e - _s
                      for _s, _e, _junk in c.getregions()) * 100. / qlen
            acc = sum(x.identity * x.qCoverage
                      for x in d3[seqid]) * 1. / sum(x.qCoverage
                                                     for x in d3[seqid])
            rec2['scaffold'] = ",".join(scaffolds)
            rec2['num_scaffold'] = len(scaffolds)
            rec2['scaffold_coverage'] = cov
            rec2['scaffold_acc'] = acc
            if blastn_filename is not None:
                rec2['blastn_best'] = best_of[seqid][1]
        writer2.writerow(rec2)
Example #29
def main():
    args = get_args()
    if args.files:
        files = [
            f for f in glob.glob(os.path.join(args.bed, '*.bed'))
            if f in args.files
        ]
        assert len(files) == len(
            args.files), "You have specified files that are not in {0}".format(
                args.bed)
    else:
        files = glob.glob(os.path.join(args.bed, '*.bed'))
    #pdb.set_trace()
    for f in files:
        # setup output files
        fname = os.path.splitext(os.path.basename(f))[0]
        print "Processing: {0}".format(fname)
        outfname = "{0}.json".format(fname)
        outfile = open(os.path.join(args.outdir, outfname), 'w')
        # outdata
        outdata = {}
        # setup our cluster tree, a name dict, and a counter
        cluster_trees = defaultdict(lambda: ClusterTree(500, 2))
        name_map = {}
        name_counter = Counter()
        for line in open(f, 'rU'):
            if not line.startswith('track'):
                chromo, start, end, name = line.split("\t")[:4]
                start, end = int(start), int(end)
                # convert the BED name to a dict, indexed by unique db id pkey
                temp_dict = get_dict_from_name(name)
                key = int(temp_dict['probes-id'])
                # create a counter of names, so we can check for dupe hits
                name_counter.update([key])
                name_map[key] = temp_dict
                cluster_trees[chromo].insert(start, end, key)
        duplicate_hits = check_for_dupe_hits(name_counter)
        if duplicate_hits:
            outdata['Duplicates'] = True
        else:
            outdata['Duplicates'] = False
        outdata['Overlaps'] = defaultdict(list)
        for chromo in cluster_trees:
            overlaps = cluster_trees[chromo].getregions()
            if overlaps:
                for span in overlaps:
                    # get distinct list of *loci* (not probes) hit for a given
                    # region.
                    loci = list(
                        set([
                            int(name_map[probe]['probes-locus'])
                            for probe in span[2]
                        ]))
                    if len(span[2]) > 1:
                        outdata['Overlaps'][chromo].append({
                            'start': span[0],
                            'end': span[1],
                            'loci': loci,
                            'probes': span[2]
                        })
        json.dump(outdata, outfile, indent=2)