Beispiel #1
0
def main():
    """Write FASTA records whose names are absent from a list of names.

    Command line::

        unmapped_seq2.py <text file> <fasta file> [min length=50]

    Every line of the text file is stripped and collected into a set of
    names.  Each key of the FASTA database that is not in that set, and
    whose sequence is at least ``min length`` long, is written to stdout
    with ``sequtil.write_fasta``.  Progress messages go to stderr.

    Raises:
        SystemExit: with status 1 when a required argument is missing.
        ValueError: when the optional minimum length is not an integer.
    """
    usage = 'unmapped_seq2.py <text file> <fasta file> [min length=50]'
    try:
        input_file = sys.argv[1]
        fasta_file = sys.argv[2]
    except IndexError:
        # Stop here: continuing without the file names would only fail
        # later with a NameError.
        sys.stderr.write(usage + '\n')
        raise SystemExit(1)
    try:
        min_length = int(sys.argv[3])
    except IndexError:
        min_length = 50

    db = seqdb.SequenceFileDB(fasta_file)

    # No newline yet: the count is appended to the same stderr line.
    sys.stderr.write('Reading sequences... ')
    with open(input_file) as handle:
        input_sequences = set(line.strip() for line in handle)
    sys.stderr.write('total %d\n' % len(input_sequences))

    sys.stderr.write('Writing unmapped sequences...\n')
    for seq in db:
        sequence = db[seq]
        if seq not in input_sequences and len(sequence) >= min_length:
            sys.stderr.write('Writing %s, %d bp\n' % (seq, len(sequence)))
            sequtil.write_fasta(sys.stdout, str(sequence), id=seq)
Beispiel #2
0
def write_seq(filename, genome, output, strand):
    """Write transcript or per-exon sequences for each row of a tab file.

    Each row is read as: column 0 chromosome, column 1 chromosome start,
    column 3 gene id, column 5 strand ('-' means negative, anything else
    positive), second-to-last column comma-separated exon sizes, and last
    column comma-separated exon starts relative to the chromosome start
    (presumably BED12 -- confirm against the input producer).

    Args:
        filename: path of the tab-delimited input file.
        genome: sequence database handed to the sequence helpers.
        output: 'transcript' writes one record per row; 'exon' writes one
            record per exon with ids ``<gene_id>_<exon number>``.
        strand: accepted for backward compatibility only; the strand is
            always taken from column 5 of each row.

    Raises:
        SystemExit: when ``output`` is neither 'transcript' nor 'exon'
            (detected on the first row processed).
    """
    with open(filename) as handle:
        reader = csv.reader(handle, dialect='excel-tab')

        for n, line in enumerate(reader, start=1):
            chrom = line[0]
            chrom_start = int(line[1])
            gene_id = line[3]
            # Skip empty items so a trailing comma in the list is tolerated.
            exon_starts = [int(start) + chrom_start
                           for start in line[-1].split(',') if start]
            exon_sizes = [int(size)
                          for size in line[-2].split(',') if size]
            exons = [Exon(chrom, start, start + exon_sizes[i])
                     for i, start in enumerate(exon_starts)]
            row_strand = 'negative' if line[5] == '-' else 'positive'

            if output == 'transcript':
                seq = get_sequence_transcript(genome, exons, row_strand)
                sequtil.write_fasta(sys.stdout, seq, id=gene_id)
            elif output == 'exon':
                seqs = get_sequence_exon(genome, exons, row_strand)

                # A separate counter keeps the row counter ``n`` intact
                # for the progress report below.
                for exon_no, seq in enumerate(seqs, start=1):
                    seq_id = gene_id + '_' + str(exon_no)
                    sequtil.write_fasta(sys.stdout, seq, id=seq_id)
            else:
                print >> sys.stderr, 'Unsupported output format.'
                raise SystemExit

            if n % 1000 == 0:
                print >> sys.stderr, '...', n
Beispiel #3
0
def main():
    """Write FASTA records whose names are absent from a list of names.

    Command line::

        unmapped_seq2.py <text file> <fasta file> [min length=50]

    Every line of the text file is stripped and collected into a set of
    names.  Each key of the FASTA database that is not in that set, and
    whose sequence is at least ``min length`` long, is written to stdout
    with ``sequtil.write_fasta``.  Progress messages go to stderr.

    Raises:
        SystemExit: with status 1 when a required argument is missing.
        ValueError: when the optional minimum length is not an integer.
    """
    usage = 'unmapped_seq2.py <text file> <fasta file> [min length=50]'
    try:
        input_file = sys.argv[1]
        fasta_file = sys.argv[2]
    except IndexError:
        # Stop here: continuing without the file names would only fail
        # later with a NameError.
        sys.stderr.write(usage + '\n')
        raise SystemExit(1)
    try:
        min_length = int(sys.argv[3])
    except IndexError:
        min_length = 50

    db = seqdb.SequenceFileDB(fasta_file)

    # No newline yet: the count is appended to the same stderr line.
    sys.stderr.write('Reading sequences... ')
    with open(input_file) as handle:
        input_sequences = set(line.strip() for line in handle)
    sys.stderr.write('total %d\n' % len(input_sequences))

    sys.stderr.write('Writing unmapped sequences...\n')
    for seq in db:
        sequence = db[seq]
        if seq not in input_sequences and len(sequence) >= min_length:
            sys.stderr.write('Writing %s, %d bp\n' % (seq, len(sequence)))
            sequtil.write_fasta(sys.stdout, str(sequence), id=seq)
def main():
    """Write one FASTA record per gene yielded by ``parse_seq``.

    ``sys.argv[1]`` is the annotation file handed to ``parse_seq`` and
    ``sys.argv[2]`` the FASTA file opened as the genome database.  Each
    ``(exons, gene_id)`` pair is turned into a sequence by ``get_sequence``
    and written to stdout; a progress line goes to stderr every 1000 genes.
    """
    filename = sys.argv[1]
    fasta_path = sys.argv[2]
    genome = seqdb.SequenceFileDB(fasta_path, verbose=False)

    written = 0
    for exons, gene_id in parse_seq(filename, genome):
        written += 1
        sequtil.write_fasta(sys.stdout, get_sequence(genome, exons),
                            id=gene_id)

        # Report progress once per thousand records.
        if not written % 1000:
            print >> sys.stderr, '...', written
Beispiel #5
0
def write_sequence(source, graph, targets, queries, ofile1, ofile2):
    """Write the sequence of every node reached from ``source``.

    Nodes are visited in the order given by
    ``nx.algorithms.dfs_preorder_nodes(graph, source)``.  A node found in
    ``targets`` has its sequence written to ``ofile1``; any other node is
    looked up in ``queries`` and written to ``ofile2``.

    Returns:
        A tuple ``(visited_nodes, max_length)``: the set of nodes visited
        and the length of the longest sequence seen (0 if none).
    """
    seen = set()
    longest = 0
    for node in nx.algorithms.dfs_preorder_nodes(graph, source):
        # Pick the sequence and its destination file in one step.
        if node in targets:
            seq, out = targets[node], ofile1
        else:
            seq, out = queries[node], ofile2

        longest = max(longest, len(seq))
        # 60 is the third positional argument of write_fasta
        # (presumably the output line width -- confirm).
        sequtil.write_fasta(out, str(seq), 60, node)
        seen.add(node)

    return seen, longest
Beispiel #6
0
def write_sequence(source, graph, targets, queries, ofile1, ofile2):
    """Dump sequences for all nodes reachable from ``source``.

    The graph is walked with ``nx.algorithms.dfs_preorder_nodes``.
    Sequences of nodes present in ``targets`` go to ``ofile1``; all other
    nodes are read from ``queries`` and go to ``ofile2``.

    Returns:
        ``(visited_nodes, max_length)`` -- the set of visited nodes and the
        largest sequence length encountered (0 when nothing was visited).
    """
    reached = set()
    top_length = 0
    for node in nx.algorithms.dfs_preorder_nodes(graph, source):
        is_target = node in targets
        record = targets[node] if is_target else queries[node]

        size = len(record)
        if size > top_length:
            top_length = size

        destination = ofile1 if is_target else ofile2
        sequtil.write_fasta(destination, str(record), 60, node)

        reached.add(node)

    return reached, top_length
Beispiel #7
0
def get_sequence(infile, genome):
    """Write the exon sequence described by each row of a tab file.

    The last column of every row is a ';'-separated list of ``key=value``
    attributes.  The attributes ``exonStart``, ``exonEnd``, ``strand`` and
    ``ID`` are required; column 0 is the chromosome name.  ``exonStart`` is
    decremented by one before slicing (presumably converting a 1-based
    start to a 0-based one -- confirm against the input producer).

    Args:
        infile: path of the tab-delimited input file.
        genome: mapping from chromosome name to a sliceable sequence.

    Raises:
        KeyError: when a required attribute or the chromosome is missing.
        ValueError: when ``exonStart`` or ``exonEnd`` is not an integer.
    """
    with open(infile) as handle:
        for rec in csv.reader(handle, dialect='excel-tab'):
            attrs = {}
            for item in rec[-1].split(';'):
                item = item.strip()
                if not item:
                    # Tolerate a trailing or doubled ';'.
                    continue
                # Split on the first '=' only so values may contain '='.
                key, _, value = item.partition('=')
                attrs[key] = value

            exon_start = int(attrs["exonStart"]) - 1
            exon_end = int(attrs["exonEnd"])
            chrom = rec[0]
            strand = attrs["strand"]
            seq_id = attrs["ID"]
            exon_seq = genome[chrom][exon_start:exon_end]
            if strand == '-':
                # Negating a pygr sequence slice gives the opposite strand.
                exon_seq = -exon_seq

            sequtil.write_fasta(sys.stdout, str(exon_seq), id=seq_id)
Beispiel #8
0
def get_sequence(cluster_trees, fasta_file, trans_db):
    """Write one FASTA file per clustered region.

    For every region returned by ``getregions()`` of every cluster tree, a
    file named ``locus_<n>.fa`` is created (``n`` counts regions from 0
    across all chromosomes) and a summary line is printed to stderr.  Each
    transcript number in the region is mapped to ``trans_db[no].trans_id``
    and its sequence is written; ids that raise ``KeyError`` on lookup in
    the FASTA database are skipped silently.
    """
    print >> sys.stderr, 'reading %s' % fasta_file
    sequences = seqdb.SequenceFileDB(fasta_file, verbose=False)

    locus_index = 0
    for chrom, tree in cluster_trees.items():
        for start, end, members in tree.getregions():
            summary = (locus_index, chrom, start, end, len(members))
            print >> sys.stderr, '%d\t%s:%d-%d\t%d' % summary

            locus_path = 'locus_%d.fa' % (locus_index)
            with open(locus_path, 'w') as op:
                for member in members:
                    seqid = trans_db[member].trans_id
                    try:
                        sequence = sequences[seqid].seq
                    except KeyError:
                        # No sequence under this id: leave it out.
                        continue
                    sequtil.write_fasta(op, sequence, id=seqid)

            locus_index += 1
Beispiel #9
0
import sys
from pygr import seqdb, sequtil

# Usage: <script> <feature file> <genome fasta>
# For each whitespace-separated input line, write the 150-base genome slice
# that starts at (column 4 - 1) on the chromosome named in column 1.
inputFile = sys.argv[1]
genome = seqdb.SequenceFileDB(sys.argv[2])

n = 0
for line in open(inputFile):
    n += 1
    features = line.split()
    chrom = features[0]
    # Column 4 minus one (presumably 1-based to 0-based -- confirm).
    start = int(features[3]) - 1
    # The record id uses the decremented start value.
    seq = genome[chrom][start:start + 150]
    sequtil.write_fasta(sys.stdout, seq, id="%s:%d" % (chrom, start))
    # Progress report every 1000 input lines.
    if not n % 1000:
        print >> sys.stderr, '...', n
Beispiel #10
0
'''

import sys
from pygr import seqdb, sequtil

# Split every sequence of a FASTA file into chunks of ``chunk_size`` bases,
# where consecutive chunks share ``overlap_size`` bases.  Sequences no longer
# than ``chunk_size`` are written unchanged under their own id; chunks are
# written as ``<id>_1``, ``<id>_2``, ...
if len(sys.argv) < 4:
    print >> sys.stderr, \
        'Usage: split_sequence.py fasta_file chunk_size overlap_size'
    raise SystemExit

input_file = sys.argv[1]
chunk_size = int(sys.argv[2])
overlap_size = int(sys.argv[3])

# The window advances by this many bases per chunk.  A non-positive step
# would never reach the end of a sequence, so reject it up front.
step = chunk_size - overlap_size
if step <= 0:
    print >> sys.stderr, 'overlap_size must be smaller than chunk_size'
    raise SystemExit(1)

db = seqdb.SequenceFileDB(input_file)

for seq in db:
    print >> sys.stderr, 'Splitting %s...' % (seq)
    if len(db[seq]) <= chunk_size:
        sequtil.write_fasta(sys.stdout, str(db[seq]), id=seq)
    else:
        seq = db[seq]
        _id = 1
        window = 0
        while window < len(seq):
            # Build the id inside the loop so each chunk gets its own number.
            chunk_id = "%s_%d" % (seq.id, _id)
            chunk = seq[window:window + chunk_size]
            sequtil.write_fasta(sys.stdout, str(chunk), id=chunk_id)
            _id += 1
            window += step
'''Get a part of a target sequence that aligned to a given query sequence.'''

import sys
import csv
from pygr import seqdb, sequtil

# Usage: <script> <psl file> <genome fasta>
# For each tab-separated row, write the slice of the target sequence
# (column index 13) between column indexes 15 and 16, with the id
# "<target>_<column index 9>".
psl_file, genome_file = sys.argv[1], sys.argv[2]
genome = seqdb.SequenceFileDB(genome_file)

reader = csv.reader(open(psl_file), dialect='excel-tab')
for cols in reader:
    query, target = cols[9], cols[13]
    start, end = int(cols[15]), int(cols[16])
    seq = genome[target][start:end]
    seqid = '%s_%s' % (target, query)
    sequtil.write_fasta(sys.stdout, str(seq), id=seqid)