def main():
    """Cluster queries and targets linked by BLAST-9 alignments.

    argv: <query fasta> <target fasta> <blast9 file> [min score=200]

    Builds an undirected graph with one edge per alignment, then writes
    each connected cluster's targets/queries to per-cluster files (via
    write_sequence) and logs node sizes to 'assemgraph.log'.
    """
    # NOTE(review): without this, MIN_SCORE below was a dead local; the
    # module-level value (used by parse_alignments, presumably) was never
    # updated -- confirm against module globals.
    global MIN_SCORE
    if len(sys.argv) < 4:
        print >> sys.stderr, \
            'Usage: assemgraph2.py <query file> <target file>' + \
            ' <blast9 file> [min score=200]'
        raise SystemExit
    try:
        # BUG FIX: the optional min-score is the 4th argument (argv[4]);
        # the original read argv[5], which the documented usage never supplies.
        MIN_SCORE = float(sys.argv[4])
    except IndexError:
        pass  # keep the default score cutoff

    print >> sys.stderr, 'Reading sequence databases...'
    queries = seqdb.SequenceFileDB(sys.argv[1])
    targets = seqdb.SequenceFileDB(sys.argv[2])
    print >> sys.stderr, len(queries), len(targets)

    try:
        align_file = open(sys.argv[3])
    except IOError as e:
        print >> sys.stderr, 'Error: check alignment file.'
        raise e

    print >> sys.stderr, 'Constructing alignment graphs...'
    graph = nx.Graph()
    for c, (query, target) in enumerate(parse_alignments(align_file)):
        graph.add_edge(query, target)
        if c % 100 == 0:
            print >> sys.stderr, '...', c

    logfile = open('assemgraph.log', 'w')
    visited_nodes = set()
    cluster_no = 0
    for node in graph.nodes():
        if node not in visited_nodes:
            filename1 = 'cluster_%d_targets' % cluster_no
            filename2 = 'cluster_%d_queries' % cluster_no
            ofile1 = open(filename1, 'w')
            ofile2 = open(filename2, 'w')
            print >> sys.stderr, \
                'Writing cluster %d to a file...' % cluster_no
            # write_sequence walks the component starting at `node` and
            # returns the nodes it visited plus the longest sequence length
            vnodes, max_length = (write_sequence(node, graph,
                                                 targets, queries,
                                                 ofile1, ofile2))
            visited_nodes.update(vnodes)
            for n in vnodes:
                # a node id lives in exactly one of the two databases
                size = len(targets[n]) if n in targets else len(queries[n])
                print >> logfile, 'cluster_%d\t%s\t%d' % (cluster_no, n, size)
            ofile1.close()
            ofile2.close()
            print >> sys.stderr, 'total nodes = %d' % len(vnodes)
            cluster_no += 1
    print >> logfile, '***finished***'
    logfile.close()
def main():
    """Write FASTA records absent from a list of mapped sequence names.

    argv: <text file> <fasta file> [min length=50]

    Sequences in <fasta file> whose ids are NOT listed in <text file>
    and that are at least min_length bp long are written to stdout.
    """
    try:
        input_file = sys.argv[1]
        fasta_file = sys.argv[2]
    except IndexError:
        print >> sys.stderr, \
            'unmapped_seq2.py <text file> <fasta file> [min length=50]'
        # BUG FIX: the original printed usage and then fell through,
        # crashing on the unbound fasta_file below; exit instead.
        raise SystemExit
    try:
        min_length = int(sys.argv[3])
    except IndexError:
        min_length = 50  # default minimum sequence length

    db = seqdb.SequenceFileDB(fasta_file)

    # ids of sequences that are already mapped, one per line
    input_sequences = set()
    print >> sys.stderr, 'Reading sequences...',
    for line in open(input_file):
        input_sequences.add(line.strip())
    print >> sys.stderr, 'total %d' % len(input_sequences)

    print >> sys.stderr, 'Writing unmapped sequences...'
    for seq in db:
        sequence = db[seq]
        if (seq not in input_sequences and len(sequence) >= min_length):
            print >> sys.stderr, 'Writing %s, %d bp' % (seq, len(sequence))
            sequtil.write_fasta(sys.stdout, str(sequence), id=seq)
def pygr_slice(fname):
    """Open *fname* with pygr and take a 100 bp slice of each record."""
    db = seqdb.SequenceFileDB(fname)
    for name in db:
        record = db[name]
        # NB: the break means only a single slice is ever taken per record,
        # regardless of NSLICE (preserved from the original)
        for _ in range(NSLICE):
            sub = str(record[:100])
            break
def parse_motif():
    """Summarize motif occurrences for every peak reported by count_peak().

    For each peak, fetches its hg19 sequence, counts matches of the three
    m6A-style motifs, and writes one tab-separated row to the gzipped
    output 'peak_sum.tsv.gz'.
    """
    peak_dict = count_peak()
    # cluster-specific hard-coded reference genome path
    genome = seqdb.SequenceFileDB('/u/home/f/frankwoe/nobackup/hg19/hg19.noRand.fa')
    # motifs include: GGACT, RRACT, DRACH
    # (IUPAC degeneracies: R=[AG], D=[AGT], H=[ACT])
    motif_list = ["(GGACT)", "([GA][GA]ACT)", "([AGT][AG]AC[ACT])"]
    with gzip.GzipFile('peak_sum.tsv.gz', 'wb') as f:
        # header line
        f.write("\t".join(
            [
                'peak', 'exp_num', 'exp', 'occurence_num', 'occurence',
                'fc', 'GGACT,RRACT,DRACH', 'seq'
            ]) + '\n')
        for peak in peak_dict:
            # peak keys look like "chrom:start:end:strand"
            chrom, start, end, strand = peak.split(':')
            peak_seq = _fetch_seq(genome, chrom, start, end, strand)
            # keys of the inner dict are occurrence labels; values are
            # fold changes -- assumed label format "<x>.<experiment>.<...>",
            # TODO confirm against count_peak()
            occurence = peak_dict[peak].keys()
            occured_exp = list(set([x.split('.')[1] for x in occurence]))
            fc = [str(peak_dict[peak][x]) for x in occurence]
            motif_count = _match_motif(peak_seq, motif_list)
            f.write('\t'.join(
                [
                    peak, str(len(occured_exp)), ','.join(occured_exp),
                    str(len(occurence)), ','.join(occurence),
                    ','.join(fc), ','.join(motif_count), peak_seq
                ]) + '\n')
def pygr_reverse_comp(fname):
    """Reverse-complement each record of *fname* in sorted key order."""
    db = seqdb.SequenceFileDB(fname)
    names = db.keys()
    names.sort()
    for name in names:
        # negating a pygr sequence yields its reverse complement;
        # str() forces the full sequence to be materialized
        revcomp = str(-db[name])
        sub = revcomp[:10]
def setUp(self):
    """Load the chicken genome and build a two-exon test Isoform."""
    path = '/Users/Likit/projects/mdv/data/chick.fa'  # local test fixture
    self.genome = seqdb.SequenceFileDB(path, verbose=False)
    exon_coords = [('chr1', 51035309, 51035430),
                   ('chr1', 51062489, 51062516)]
    self.isoform = genebuilder.Isoform('chr1', '1', '0',
                                       exon_coords, self.genome)
def main():
    '''Main function.

    argv: <genome fasta> [splice sites file]; falls back to stdin when
    the second argument is missing or unreadable.
    '''
    genome = seqdb.SequenceFileDB(sys.argv[1])  # genome sequence
    try:
        # splice sites file from gimme/compare_junction.py
        infile = open(sys.argv[2])
    except (IndexError, IOError):
        # BUG FIX: 'except IndexError, IOError:' is Py2 syntax that catches
        # ONLY IndexError and binds it to the name IOError; a tuple catches
        # both exceptions as intended.
        # no input file given or cannot open the file,
        # read data from stdin instead
        infile = sys.stdin
def bed2pygr(dbprefix, referencefile, bedfile, indir):
    """Build a pairwise NLMSA from *bedfile* against *referencefile*.

    The NLMSA is written under *dbprefix*; the reference's basename
    (minus extension) is recorded in '<dbprefix>.genome'.
    """
    collision_counter = defaultdict(int)
    reference_db = seqdb.SequenceFileDB(referencefile)
    annotation_db = annotation.AnnotationDB({}, reference_db)
    alignment = cnestedlist.NLMSA(dbprefix, 'w', pairwiseMode=True)
    load_bed(alignment, annotation_db, bedfile, collision_counter)
    alignment.build(saveSeqDict=True)
    genome_name = os.path.basename(referencefile).rsplit('.', 1)[0]
    # marker file telling downstream tools which genome this DB uses
    # (os.path.join on a single component is an identity, kept as-is)
    print >> open(os.path.join(dbprefix) + '.genome', 'w'), genome_name
def read_genbank_annots(gbfile, fastafile=None, featureType='CDS',
                        geneQualifier='gene'):
    '''construct annotation DB for gene CDS intervals. NB: this assumes
    each gene consists of ONE interval. This cannot be used for
    multi-exon genes!

    gbfile: GenBank file path (or, on older SeqIO, an open handle is
    retried); fastafile defaults to '<gbfile-stem>.fna'.
    Returns (annodb, alignment NLMSA, genome SequenceFileDB).
    '''
    try:
        gbparse = SeqIO.parse(gbfile, 'genbank')
    except TypeError:  # SeqIO changed its interface?
        # older SeqIO wants an open handle, not a path
        ifile = open(gbfile)
        try:
            gbparse = SeqIO.parse(ifile, 'genbank')
            # materialize before the handle is closed
            gbseqs = list(gbparse)
        finally:
            ifile.close()
    else:
        gbseqs = list(gbparse)
    if fastafile is None:
        fastafile = gbfile.split('.')[0] + '.fna'
    genome = seqdb.SequenceFileDB(fastafile)
    genomeIndex = blast.BlastIDIndex(genome)  # handle NCBI ID blobs properly
    annodb = annotation.AnnotationDB({}, genome,
                                     sliceAttrDict=dict(id=0, start=1,
                                                        stop=2,
                                                        orientation=3))
    i = 0  # counter used only to label annotations missing a gene qualifier
    for s in gbseqs:
        seqID = genomeIndex[s.id].id  # find the right seq and get its actual ID
        for f in s.features:
            if f.type == featureType:
                try:
                    name = f.qualifiers[geneQualifier][0]
                except KeyError:  # keep the annotation even if label missing
                    warnings.warn('Missing gene qualifier "%s" on %s annotation'
                                  % (geneQualifier, featureType))
                    name = 'unlabeled_%s_%d' % (featureType, i)
                    i += 1
                annodb.new_annotation(name,
                                      (seqID, f.location.start.position,
                                       f.location.end.position, f.strand))
    # in-memory pairwise alignment holding every annotation interval
    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True)
    for a in annodb.itervalues():
        al.addAnnotation(a)
    al.build()
    return annodb, al, genome
''' import sys from pygr import seqdb, sequtil if len(sys.argv) < 4: print >> sys.stderr, \ 'Usage: split_sequence.py fasta_file chunk_size overlap_size' raise SystemExit input_file = sys.argv[1] chunk_size = int(sys.argv[2]) overlap_size = int(sys.argv[3]) db = seqdb.SequenceFileDB(input_file) for seq in db: window = 0 print >> sys.stderr, 'Splitting %s...' % (seq) if len(db[seq]) <= chunk_size: sequtil.write_fasta(sys.stdout, str(db[seq]), id=seq) else: seq = db[seq] _id = 1 chunk_id = "%s_%d" % (seq.id, _id) while window < len(seq): chunk = seq[window:window + chunk_size] sequtil.write_fasta(sys.stdout, str(chunk), id=chunk_id) _id += 1 window += (chunk_size - overlap_size)
def main():
    """Cluster sequences via an NLMSA alignment database.

    argv: <query fasta> <target fasta> <alignment file> [min score=200]

    Loads alignments into an in-memory NLMSA, derives an undirected graph
    of aligned sequence ids, then writes each connected cluster's
    targets/queries to per-cluster files and logs sizes to 'assemgraph.log'.
    """
    # NOTE(review): without this, MIN_SCORE below was a dead local; the
    # module-level value (used by parse_alignments, presumably) was never
    # updated -- confirm against module globals.
    global MIN_SCORE
    if len(sys.argv) < 4:
        raise SystemExit
    try:
        # BUG FIX: the optional score cutoff is the 4th argument (argv[4]);
        # the original read argv[5], which could never be supplied.
        MIN_SCORE = float(sys.argv[4])
    except IndexError:
        pass  # keep the default score cutoff

    print >> sys.stderr, 'Reading sequence databases...'
    queries = seqdb.SequenceFileDB(sys.argv[1])
    targets = seqdb.SequenceFileDB(sys.argv[2])
    print >> sys.stderr, len(queries), len(targets)

    try:
        align_file = open(sys.argv[3])
    except IOError as e:
        print >> sys.stderr, 'Error: check alignment file.'
        raise e

    aligndb = cnestedlist.NLMSA('alignment', mode='memory', pairwiseMode=True)
    print >> sys.stderr, 'Adding sequences to an alignment database...'
    target_list = set()
    for c, al in enumerate(parse_alignments(align_file)):
        aligndb += targets[al.target]
        target_list.add(al.target)
        add_alignment(aligndb, al, targets, queries)
        if c % 100 == 0:
            print >> sys.stderr, '...', c
    print >> sys.stderr, 'Building the alignment database...'
    aligndb.build()

    print >> sys.stderr, 'Constructing alignment graphs...'
    graph = nx.Graph()
    for c, target in enumerate(target_list):
        try:
            sub_ival = targets[target]
            for src, dest, edge in aligndb[sub_ival].edges():
                # strip the pygr interval suffix ("id[start:stop]") and any
                # leading '-' (reverse orientation) to get the bare id
                source = repr(src).split('[')[0].lstrip('-')
                destination = repr(dest).split('[')[0].lstrip('-')
                graph.add_edge(source, destination)
        except KeyError:
            pass  # target has no alignments in the NLMSA
        if c % 100 == 0:
            print >> sys.stderr, '...', c

    logfile = open('assemgraph.log', 'w')
    visited_nodes = set()
    cluster_no = 0
    for node in graph.nodes():
        if node not in visited_nodes:
            filename1 = 'cluster_%d_targets' % cluster_no
            filename2 = 'cluster_%d_queries' % cluster_no
            ofile1 = open(filename1, 'w')
            ofile2 = open(filename2, 'w')
            # BUG FIX: the original statement was severed -- the
            # "% cluster_no" operand had been fused onto the following
            # assignment, which is a syntax error; rejoined here.
            print >> sys.stderr, \
                'Writing cluster %d to a file...' % cluster_no
            vnodes, max_length = (write_sequence(node, graph,
                                                 targets, queries,
                                                 ofile1, ofile2))
            visited_nodes.update(vnodes)
            for n in vnodes:
                # a node id lives in exactly one of the two databases
                size = len(targets[n]) if n in targets else len(queries[n])
                print >> logfile, 'cluster_%d\t%s\t%d' % (cluster_no, n, size)
            ofile1.close()
            ofile2.close()
            print >> sys.stderr, '\ttotal nodes = %d' % len(vnodes)
            cluster_no += 1
    print >> logfile, '***finished***'
    logfile.close()
def read_genome(fn):
    '''Load the FASTA file *fn* as a pygr sequence database.'''
    return seqdb.SequenceFileDB(fn)
'''Get a part of a target sequence that aligned to a given
query sequence.'''

import sys
import csv

from pygr import seqdb, sequtil

psl_file = sys.argv[1]
genome_file = sys.argv[2]

genome = seqdb.SequenceFileDB(genome_file)

# FIX: close the PSL file deterministically instead of leaking the handle
# returned by csv.reader(open(...)).
with open(psl_file) as fp:
    reader = csv.reader(fp, dialect='excel-tab')
    for cols in reader:
        target = cols[13]      # PSL tName
        start = int(cols[15])  # PSL tStart
        end = int(cols[16])    # PSL tEnd
        seq = genome[target][start:end]
        seqid = target + '_' + cols[9]  # tName_qName
        sequtil.write_fasta(sys.stdout, str(seq), id=seqid)
(o, args) = opts.parse_args()

GC_width = int(o.gc_width)

# Select the sequence source: a named pygr.Data genome resource when
# --pygr_seq is given, otherwise a sequence file via --fn_fastq_seq.
if(o.pygr_seq != None):
    if(o.pygr_seq.upper() == "HG18"):
        seqs = pygr.Data.Bio.Seq.Genome.Human.hg18()
    elif(o.pygr_seq.upper() == "HG19"):
        seqs = pygr.Data.Bio.Seq.Genome.Human.hg19()
    elif(o.pygr_seq.upper() == "CHIMPY"):
        seqs = pygr.Data.Bio.Seq.Genome.chimp.chrY()
    elif(o.pygr_seq.upper() == "BACS" or o.pygr_seq.upper() == "CONTROL_BACS"):
        seqs = pygr.Data.Bio.Seq.Genome.Human.control_bacs()
    # NOTE(review): an unrecognized --pygr_seq value leaves `seqs` unbound
    # and the loop below raises NameError -- confirm whether intentional
elif(o.fn_fastq_seq!=None):
    seqs = seqdb.SequenceFileDB(o.fn_fastq_seq)
else:
    print("no sequence file input... exiting")
    sys.exit(1)

GC_content = {}
grp = "GC_content"
GC_DT = create_gc_DenseTrackSet(o.fnoutTable,o.fnContigLengths,grp,o.overwrite)

# Process only contigs that already exist in the dense track set.
for contig in seqs:
    if(contig in GC_DT[grp]):
        print("loading %s..."%(contig))
        #if(chr=="chr21"):
        #GC_content["chr21"] = get_chr_correction(hg18,"chr21",GC_width)
def pygr_parse_fasta(fname):
    """Index *fname* with pygr's SequenceFileDB (parse/index only)."""
    db = seqdb.SequenceFileDB(fname)
def main():
    """Extract sequences listed in argv[1] from the genome in argv[2]."""
    sites_file = sys.argv[1]
    genome_db = seqdb.SequenceFileDB(sys.argv[2])
    get_sequence(sites_file, genome_db)
def pygr_iter(fname):
    """Iterate every record of *fname*, fetching each sequence object."""
    db = seqdb.SequenceFileDB(fname)
    for name in db:
        record = db[name]
def main(options, args):
    """Build gene models from PSL alignments.

    Pipeline: parse/cluster exons from options.infile, merge clusters,
    clean up linked exons (options.minimumUTRLength), build transcript
    paths, construct Isoform objects against options.genome, drop
    redundant sequences, find ORFs, and write BED + DNA/mRNA/protein
    FASTA outputs under options.basename.
    """
    exons = {}
    clusters = {}
    newClusterID = 0
    clusterConnections = {}
    linkedExons = {}
    exonPositions = {}
    endExons = {}
    singleton = 0  # alignments consisting of a single block
    print >> sys.stderr, 'Minimum UTR length = ', options.minimumUTRLength
    print >> sys.stderr, 'Parsing and clustering exons..'
    for n, alnObj in enumerate(psl_parser.read(open(options.infile),
                                               'track')):
        tStarts = alnObj.attrib['tStarts']
        blockSizes = alnObj.attrib['blockSizes']
        if len(blockSizes) == 1:
            singleton += 1
        tName = alnObj.attrib['tName']
        newClusterID = construct(tName, tStarts, blockSizes, exons,
                                 clusters, newClusterID,
                                 clusterConnections, linkedExons,
                                 exonPositions, endExons)
        if n % 1000 == 0:
            print >> sys.stderr, '...', n
    print >> sys.stderr, 'Total singletons = ', singleton

    # per-reference exon tally, for reporting only
    sumExons = {}
    for ref, end in exons:
        try:
            sumExons[ref] += 1
        except KeyError:
            sumExons[ref] = 1
    for ref in sorted(sumExons):
        print >> sys.stderr, '\t%s has %d exon(s).' % (ref, sumExons[ref])
    print >> sys.stderr, '\nTotal %d cluster(s) found.' % len(clusters)

    print >> sys.stderr, '\nMerging clusters..'
    mergedClusters = mergeClusters(clusters, clusterConnections)

    print >> sys.stderr, '\nCleaning up..'
    ignored = set([])
    for cl in mergedClusters:
        allExons = mergedClusters[cl]
        cleanUpLinkedExons(allExons, linkedExons, exonPositions, ignored,
                           options.minimumUTRLength)

    print >> sys.stderr, 'Modifying the right end of each transcript..'
    for cl in mergedClusters:
        findLongestEnd(mergedClusters[cl], linkedExons, endExons,
                       exonPositions, ignored)

    print >> sys.stderr, '\nConstructing transcripts..'
    allPaths = {}
    visited = set([])
    for n, cl in enumerate(mergedClusters):
        txExons = sorted(mergedClusters[cl])
        paths = buildPaths(linkedExons, txExons, allPaths, ignored, visited)
        allPaths[cl] = paths
        if n % 1000 == 0:
            if n > 0:
                print >> sys.stderr, '... %d built..' % n

    genome = seqdb.SequenceFileDB(options.genome, verbose=False)

    # Create isoform objects from allPaths and search for ORF.
    print >> sys.stderr, '\nBuilding gene models..'
    allGenes = {}
    n = 0
    for chrom, geneID in allPaths:
        n += 1
        isoformID = 0
        for isoExons in allPaths[(chrom, geneID)]:
            isoform = Isoform(chrom, geneID, isoformID, isoExons, genome)
            if chrom not in allGenes:
                allGenes[chrom] = {}
                allGenes[chrom][geneID] = [isoform]
            else:
                try:
                    allGenes[chrom][geneID].append(isoform)
                except KeyError:
                    allGenes[chrom][geneID] = [isoform]
            isoformID += 1
        if n % 1000 == 0:
            print >> sys.stderr, '...', n

    print >> sys.stderr, '\nRemoving redundant sequences..'
    findRedundantSequence(allGenes)

    # Creating sequence records for each DNA, RNA and protein sequences.
    isoformDNASeqs = []
    isoformProteinSeqs = []
    isoformRNASeqs = []
    totalGenes = 0
    for chrom in allGenes:
        for geneID in allGenes[chrom]:
            totalGenes += 1
            isoformID = 0
            for isoform in allGenes[chrom][geneID]:
                if not isoform.redundant:
                    isoform.isoformID = isoformID
                    isoformName = '%s:%d.%d' % (chrom, geneID,
                                                isoform.isoformID)
                    DNARecord = SeqRecord(isoform.dnaSeq, id=isoformName)
                    isoformDNASeqs.append(DNARecord)
                    # Search for ORF for non-redundant sequences
                    print >> sys.stderr, 'searching ORF: %s:%d.%d' \
                        % (chrom, geneID, isoformID)
                    findORF(isoform)
                    if isoform.frame:
                        proteinRecord = SeqRecord(isoform.proteinSeq,
                                                  id=isoformName)
                        RNARecord = SeqRecord(isoform.mrnaSeq,
                                              id=isoformName)
                        isoformProteinSeqs.append(proteinRecord)
                        isoformRNASeqs.append(RNARecord)
                    isoformID += 1
                if n > 0 and n % 1000 == 0:
                    print >> sys.stderr, '...', n, 'transcripts done.'
    # BUG FIX: the count was passed as a separate print operand
    # ("... %d\n\n', totalGenes"), so the literal '%d' was printed;
    # apply the format operator instead.
    print >> sys.stderr, 'Total genes = %d\n\n' % totalGenes

    print >> sys.stderr, 'Writing gene models to file...'
    writeBEDFile(allGenes, options.basename)

    print >> sys.stderr, 'Writing DNA sequences to file...'
    SeqIO.write(isoformDNASeqs, options.basename + '.dnas.fa', 'fasta')

    print >> sys.stderr, 'Writing RNA sequences to file...'
    SeqIO.write(isoformRNASeqs, options.basename + '.mrnas.fa', 'fasta')

    print >> sys.stderr, 'Writing protein sequences to file...'
    SeqIO.write(isoformProteinSeqs, options.basename + '.proteins.fa',
                'fasta')
def setUp(self):
    """Load the mouse hbb1 test FASTA and derive its translation DB."""
    fasta_path = testutil.datafile('hbb1_mouse.fa')
    self.dna = seqdb.SequenceFileDB(fasta_path)
    self.tdb = translationDB.get_translation_db(self.dna)