def main():
    '''Write every database sequence that is not listed in the input file.

    Command line: unmapped_seq2.py <text file> <fasta file> [min length=50]

    The text file is read as one sequence ID per line.  Each sequence of the
    FASTA file whose ID is not in that list and whose length is at least the
    minimum length is written to stdout in FASTA format.  Progress messages
    go to stderr.

    Raises:
        SystemExit: with status 1 when either required argument is missing.
    '''
    try:
        input_file = sys.argv[1]
        fasta_file = sys.argv[2]
    except IndexError:
        # Nothing below can run without both paths; stop here instead of
        # falling through to a NameError on the unset variables.
        sys.stderr.write(
            'unmapped_seq2.py <text file> <fasta file> [min length=50]\n')
        raise SystemExit(1)

    try:
        min_length = int(sys.argv[3])
    except IndexError:
        min_length = 50

    db = seqdb.SequenceFileDB(fasta_file)

    # write() is used for diagnostics so the emitted text is the same under
    # Python 2 and Python 3.
    sys.stderr.write('Reading sequences...')
    with open(input_file) as handle:
        input_sequences = set(line.strip() for line in handle)
    sys.stderr.write(' total %d\n' % len(input_sequences))

    sys.stderr.write('Writing unmapped sequences...\n')
    for seq in db:
        if seq in input_sequences:
            # Listed IDs are skipped before touching the database record.
            continue
        sequence = db[seq]
        if len(sequence) >= min_length:
            sys.stderr.write('Writing %s, %d bp\n' % (seq, len(sequence)))
            sequtil.write_fasta(sys.stdout, str(sequence), id=seq)
def write_seq(filename, genome, output, strand):
    '''Write the sequence of every gene model in a tab-delimited file.

    Each row is read as: column 1 chromosome, column 2 chromosome start,
    column 4 gene ID, column 6 strand ('-' means negative, anything else
    positive), second-to-last column comma-separated exon sizes, last column
    comma-separated exon starts relative to the chromosome start.  FASTA
    records go to stdout; a running row count goes to stderr every 1000 rows.

    Args:
        filename: path of the tab-delimited input file.
        genome: sequence source handed to the sequence getters.
        output: 'transcript' writes one record per row; 'exon' writes one
            record per exon, with IDs of the form ``<gene_id>_<n>``.
        strand: kept for backward compatibility.  The value is not used: the
            strand has always been taken from column 6 of each row.

    Raises:
        SystemExit: with status 1 when ``output`` is not a supported format.
        IndexError: when a row lists fewer exon sizes than exon starts.
    '''
    with open(filename) as handle:
        reader = csv.reader(handle, dialect='excel-tab')
        for n, line in enumerate(reader, start=1):
            chrom = line[0]
            chrom_start = int(line[1])
            gene_id = line[3]

            # A trailing comma in either list leaves an empty item; drop it
            # instead of failing in int('').
            exon_starts = [int(offset) + chrom_start
                           for offset in line[-1].split(',') if offset]
            exon_sizes = [int(size)
                          for size in line[-2].split(',') if size]
            if len(exon_sizes) < len(exon_starts):
                raise IndexError(
                    'row %d: fewer exon sizes than exon starts' % n)
            exons = [Exon(chrom, begin, begin + size)
                     for begin, size in zip(exon_starts, exon_sizes)]

            row_strand = 'negative' if line[5] == '-' else 'positive'

            if output == 'transcript':
                seq = get_sequence_transcript(genome, exons, row_strand)
                sequtil.write_fasta(sys.stdout, seq, id=gene_id)
            elif output == 'exon':
                seqs = get_sequence_exon(genome, exons, row_strand)
                # The exon counter has its own name so it cannot clobber the
                # row counter used for progress reporting below.
                for exon_no, seq in enumerate(seqs, start=1):
                    seq_id = gene_id + '_' + str(exon_no)
                    sequtil.write_fasta(sys.stdout, seq, id=seq_id)
            else:
                sys.stderr.write('Unsupported output format.\n')
                raise SystemExit(1)

            if n % 1000 == 0:
                sys.stderr.write('... %d\n' % n)
def main():
    '''Write one FASTA record per gene produced by ``parse_seq``.

    ``sys.argv[1]`` is the file handed to ``parse_seq`` and ``sys.argv[2]``
    is the path opened as the genome sequence database.  Records go to
    stdout; the running record count is printed to stderr every 1000 records.
    '''
    annotation_path, genome_path = sys.argv[1], sys.argv[2]
    genome = seqdb.SequenceFileDB(genome_path, verbose=False)

    written = 0
    for exons, gene_id in parse_seq(annotation_path, genome):
        written += 1
        sequtil.write_fasta(
            sys.stdout, get_sequence(genome, exons), id=gene_id)
        if not written % 1000:
            print >> sys.stderr, '...', written
def write_sequence(source, graph, targets, queries, ofile1, ofile2):
    '''Write the sequence of every node reached from ``source``.

    Nodes are visited in depth-first preorder.  A node found in ``targets``
    is written to ``ofile1`` using its entry in ``targets``; any other node
    is written to ``ofile2`` using its entry in ``queries``.

    Returns:
        A ``(visited_nodes, max_length)`` tuple: the set of nodes visited and
        the length of the longest sequence among them (0 if none).
    '''
    seen = set()
    longest = 0
    for node in nx.algorithms.dfs_preorder_nodes(graph, source):
        # Pick the sequence and its destination together.
        if node in targets:
            record, sink = targets[node], ofile1
        else:
            record, sink = queries[node], ofile2
        longest = max(longest, len(record))
        # 60 is passed positionally, as before (presumably the FASTA line
        # width -- confirm against write_fasta's signature).
        sequtil.write_fasta(sink, str(record), 60, node)
        seen.add(node)
    return seen, longest
def _parse_attributes(field):
    '''Return the ``key=value`` items of a ';'-separated field as a dict.'''
    attrs = {}
    for item in field.split(';'):
        if not item.strip():
            # A trailing or doubled ';' leaves an empty item; nothing to keep.
            continue
        # Split on the first '=' only, so a value may itself contain '='.
        key, sep, value = item.partition('=')
        if not sep:
            raise ValueError(
                'malformed attribute %r (expected key=value)' % item)
        attrs[key] = value
    return attrs


def get_sequence(infile, genome):
    '''Write the exon sequence described by each row of a tab-delimited file.

    The first column of a row is the chromosome name and the last column is
    a ';'-separated list of ``key=value`` attributes, of which ``exonStart``,
    ``exonEnd``, ``strand`` and ``ID`` are required.  Each exon is written to
    stdout in FASTA format under its ``ID``.  Blank rows are skipped.

    Args:
        infile: path of the tab-delimited input file.
        genome: mapping from chromosome name to a sliceable sequence.

    Raises:
        KeyError: when a required attribute is missing from a row.
        ValueError: when an attribute item has no '=' or a coordinate is not
            an integer.
    '''
    with open(infile) as handle:
        for rec in csv.reader(handle, dialect='excel-tab'):
            if not rec:
                # csv yields an empty list for a blank line.
                continue
            attrs = _parse_attributes(rec[-1])
            # The start is shifted down by one before slicing (presumably
            # 1-based inclusive input coordinates -- confirm with the data).
            exon_start = int(attrs["exonStart"]) - 1
            exon_end = int(attrs["exonEnd"])
            strand = attrs["strand"]
            seq_id = attrs["ID"]

            exon_seq = genome[rec[0]][exon_start:exon_end]
            if strand == '-':
                # Unary minus on the slice (presumably pygr's reverse
                # complement -- confirm).
                exon_seq = -exon_seq
            sequtil.write_fasta(sys.stdout, str(exon_seq), id=seq_id)
def get_sequence(cluster_trees, fasta_file, trans_db):
    '''Write the transcript sequences of every cluster to its own FASTA file.

    Clusters are numbered from 0 in iteration order and cluster ``i`` is
    written to ``locus_<i>.fa`` in the current working directory.  One
    tab-separated summary line per cluster (number, region, transcript count)
    goes to stderr.  A transcript whose ID is not in the FASTA file is
    skipped and reported on stderr.

    Args:
        cluster_trees: mapping of chromosome name to an object whose
            ``getregions()`` yields ``(start, end, trans_nos)`` tuples.
        fasta_file: path of the FASTA file holding the transcript sequences.
        trans_db: container indexed by the items of ``trans_nos``; each entry
            must expose a ``trans_id`` attribute.
    '''
    sys.stderr.write('reading %s\n' % fasta_file)
    sequences = seqdb.SequenceFileDB(fasta_file, verbose=False)

    cluster_no = 0
    for chrom, cluster_tree in cluster_trees.items():
        for start, end, trans_nos in cluster_tree.getregions():
            sys.stderr.write(
                '%d\t%s:%d-%d\t%d\n'
                % (cluster_no, chrom, start, end, len(trans_nos)))
            with open('locus_%d.fa' % (cluster_no), 'w') as op:
                for no in trans_nos:
                    seqid = trans_db[no].trans_id
                    try:
                        sequence = sequences[seqid].seq
                    except KeyError:
                        # Still best-effort, but say which transcript was
                        # left out instead of dropping it silently.
                        sys.stderr.write(
                            'warning: %s not found in %s, skipped\n'
                            % (seqid, fasta_file))
                        continue
                    sequtil.write_fasta(op, sequence, id=seqid)
            # One output file per region, so the counter advances per region.
            cluster_no += 1
'''Write a 150-position window of genome sequence for every input line.

Arguments: <input file> <genome FASTA file>

Each input line is split on whitespace; field 1 is the chromosome name and
field 4 is a position.  The window starts at that position minus one and is
written to stdout in FASTA format under the ID ``<chrom>:<start>``.  The
running line count is printed to stderr every 1000 lines.
'''
import sys

from pygr import seqdb, sequtil

inputFile = sys.argv[1]
genome = seqdb.SequenceFileDB(sys.argv[2])

for n, line in enumerate(open(inputFile), 1):
    features = line.split()
    chrom, start = features[0], int(features[3]) - 1
    end = start + 150
    # The ID carries the shifted start, exactly as it is used for slicing.
    snpid = "%s:%d" % (chrom, start)
    seq = genome[chrom][start:end]
    sequtil.write_fasta(sys.stdout, seq, id=snpid)
    if not n % 1000:
        print >> sys.stderr, '...', n
'''Split every sequence of a FASTA file into overlapping chunks.

Usage: split_sequence.py fasta_file chunk_size overlap_size

A sequence no longer than chunk_size is written unchanged under its own ID.
A longer sequence is cut into windows of chunk_size positions, each window
starting (chunk_size - overlap_size) positions after the previous one, and
the pieces are written under the IDs ``<id>_1``, ``<id>_2``, ...  Records go
to stdout and progress messages to stderr.
'''
import sys

from pygr import seqdb, sequtil

if len(sys.argv) < 4:
    sys.stderr.write(
        'Usage: split_sequence.py fasta_file chunk_size overlap_size\n')
    raise SystemExit(1)

input_file = sys.argv[1]
chunk_size = int(sys.argv[2])
overlap_size = int(sys.argv[3])

# The window advances by this much per chunk; a step of zero or less would
# never reach the end of the sequence.
step = chunk_size - overlap_size
if chunk_size <= 0 or step <= 0:
    sys.stderr.write(
        'chunk_size must be positive and larger than overlap_size\n')
    raise SystemExit(1)

db = seqdb.SequenceFileDB(input_file)
for seq_id in db:
    sys.stderr.write('Splitting %s...\n' % (seq_id))
    sequence = db[seq_id]
    if len(sequence) <= chunk_size:
        sequtil.write_fasta(sys.stdout, str(sequence), id=seq_id)
        continue

    window = 0
    chunk_no = 1
    while window < len(sequence):
        chunk = sequence[window:window + chunk_size]
        # The ID is rebuilt for every chunk so each piece gets its own
        # number instead of all of them sharing "_1".
        chunk_id = "%s_%d" % (sequence.id, chunk_no)
        sequtil.write_fasta(sys.stdout, str(chunk), id=chunk_id)
        chunk_no += 1
        window += step
'''Extract the target-sequence region covered by each alignment row.

Arguments: <alignment file> <genome FASTA file>

Every row of the tab-delimited alignment file (a PSL file, going by the
variable name) supplies the target name in column 14, the region start and
end in columns 16 and 17, and the query name in column 10.  The region is
written to stdout in FASTA format under the ID ``<target>_<query>``.
'''
import csv
import sys

from pygr import seqdb, sequtil

psl_file, genome_file = sys.argv[1], sys.argv[2]

genome = seqdb.SequenceFileDB(genome_file)
reader = csv.reader(open(psl_file), dialect='excel-tab')
for cols in reader:
    target = cols[13]
    start, end = int(cols[15]), int(cols[16])
    region = genome[target][start:end]
    sequtil.write_fasta(
        sys.stdout, str(region), id='_'.join((target, cols[9])))