Python guess_seqformat Examples, bioutils.guess_seqformat Python Examples

Example #1

0

Show file

File: prune_aln_cols.py Project: aersoares81/compbio-utils

def main():
    """
    The main function

    Parses the command line, reads one alignment (from the input file,
    or from stdin if the input argument is "-") and passes it to
    prune_aln() together with the selected operation ('any_gap',
    'all_gap' or 'identical') and sys.stdout.

    Exactly one operation has to be selected. A missing input, no
    operation or more than one operation is reported via parser.error().
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if not opts.aln_in:
        parser.error("Missing input alignment argument")
        sys.exit(1)

    # Gather all requested operations first and validate explicitly.
    # An assert is not suitable here: it is stripped when Python runs
    # with -O, in which case the last selected operation would silently
    # win.
    requested = []
    if opts.any_gap:
        requested.append('any_gap')
    if opts.all_gap:
        requested.append('all_gap')
    if opts.identical:
        requested.append('identical')

    if len(requested) > 1:
        parser.error("Can only do one operation at a time")
        sys.exit(1)
    if not requested:
        parser.error("No operation selected")
        sys.exit(1)
    what = requested[0]

    if opts.aln_in == "-":
        fh_in = sys.stdin
    else:
        fh_in = open(opts.aln_in, "rU")

    # Make sure the input is closed even if guessing the format or
    # parsing the alignment fails. stdin is never closed.
    try:
        fmt = opts.informat
        if not fmt:
            fmt = bioutils.guess_seqformat(opts.aln_in)
        aln = AlignIO.read(fh_in, fmt)
    finally:
        if fh_in is not sys.stdin:
            fh_in.close()

    prune_aln(aln, what, sys.stdout)

Example #2

0

Show file

def main():
    """
    The main function

    Reads a single alignment (input file, or stdin if the input argument
    is "-") and runs prune_aln() on it with the one operation selected
    on the command line, writing to sys.stdout.

    The operation is one of 'any_gap', 'all_gap' or 'identical'.
    Selecting none or several of them is a usage error reported via
    parser.error().
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if not opts.aln_in:
        parser.error("Missing input alignment argument")
        sys.exit(1)

    # Explicit validation instead of assert statements: asserts are
    # removed under "python -O", which would let conflicting options
    # through unnoticed.
    operations = (('any_gap', opts.any_gap),
                  ('all_gap', opts.all_gap),
                  ('identical', opts.identical))
    selected = [name for (name, is_set) in operations if is_set]
    if len(selected) > 1:
        parser.error("Can only do one operation at a time")
        sys.exit(1)
    if not selected:
        parser.error("No operation selected")
        sys.exit(1)
    what = selected[0]

    if opts.aln_in == "-":
        fh_in = sys.stdin
    else:
        fh_in = open(opts.aln_in, "rU")

    try:
        fmt = opts.informat
        if not fmt:
            fmt = bioutils.guess_seqformat(opts.aln_in)
        aln = AlignIO.read(fh_in, fmt)
    finally:
        # release the file on success and on failure; leave stdin open
        if fh_in is not sys.stdin:
            fh_in.close()

    prune_aln(aln, what, sys.stdout)

Example #3

0

Show file

File: alnscore.py Project: anyone1985/compbio-utils

def main():
    """
    The main function

    Reads one alignment (from the given file, or as fasta from stdin if
    the input argument is "-") and prints one line per alignment column
    to stdout: column number (1-based), sequence identity, Shannon
    entropy and the sorted per-character counts of the column.

    Sequences named in args.ign_seqs are removed from every column
    before counting. If args.map_to names a sequence, columns where that
    sequence has a gap are not printed and the remaining columns are
    numbered via unaln_pos_map() of that sequence.
    """

    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)
    if not args.aln_in:
        parser.error("Missing input alignment argument\n")
        sys.exit(1)

    #char_set = "ACGTU"
    #char_set = "ACDEFGHIKLMNPQRSTVWY"
    #x = any
    #z = Gln or Glu
    #b = Asp or Asn
    char_set = "ACGTN"
    char_set_ambig = "N"

    LOG.warn("using hardcoded charset %s" % char_set)
    # FIXME auto-detection of alphabet

    if args.aln_in != "-" and not os.path.exists(args.aln_in):
        LOG.fatal("Input alignment %s does not exist.\n" % args.aln_in)
        sys.exit(1)

    if args.aln_in == "-":
        fh = sys.stdin
        fmt = 'fasta'
    else:
        fmt = guess_seqformat(args.aln_in)
        fh = open(args.aln_in, "rU")

    # note: had one case where this happily read an unaligned file!?
    try:
        aln = AlignIO.read(fh, fmt)
    finally:
        # The alignment is fully in memory now, so the file can go.
        # Compare against stdin (not stdout): stdin must stay open.
        if fh is not sys.stdin:
            fh.close()

    # if requested, get the sequence record of the sequence we should
    # map positions to
    map_to_seq = None
    map_to_seq_cols = None
    if args.map_to:
        candidates = [rec.seq for rec in aln if rec.id == args.map_to]
        if not candidates:
            LOG.fatal("Couldn't find a sequence called %s in %s" % (
                args.map_to, fh.name))
            sys.exit(1)
        elif len(candidates) > 1:
            LOG.fatal("Find more than one sequence with name %s in %s" % (
                args.map_to, fh.name))
            sys.exit(1)
        map_to_seq = candidates[0]
        map_to_seq_cols = unaln_pos_map(map_to_seq)

    # row index of (the first occurrence of) each sequence to ignore
    seq_ids = [rec.id for rec in aln]
    ign_idxs = []
    if args.ign_seqs:
        for s in args.ign_seqs:
            if s in seq_ids:
                ign_idxs.append(seq_ids.index(s))
            else:
                LOG.warn("No match for ignore sequence %s in alignment" % s)
    LOG.debug("ign_idxs = %s" % ign_idxs)
    # Rows have to be deleted from the back: deleting a lower index
    # first shifts all following rows and the wrong sequences would be
    # dropped. The set() guards against a name given twice.
    del_order = sorted(set(ign_idxs), reverse=True)

    entropy_per_col = []
    seqid_per_col = []
    for cidx in xrange(aln.get_alignment_length()):
        col = list(aln[:, cidx].upper())

        # ignore chars as requested from ign_seqs
        for row_idx in del_order:
            del col[row_idx]

        # normalise gaps to "-" and replace unknown characters with the
        # ambiguity symbol
        unknown_chars = set()
        for (i, c) in enumerate(col):
            if c in GAP_CHARS:
                col[i] = "-"
            elif c not in char_set:
                unknown_chars.add(c)
                col[i] = char_set_ambig

        if unknown_chars:
            LOG.warn("Found unknown characters in col %d (%s) and replaced them with %c" % (
                cidx+1, unknown_chars, char_set_ambig))

        counter = Counter(col)
        denom = sum(counter.values())
        if denom == 0:
            # only possible if every sequence was ignored; keep a
            # placeholder so the per-column lists stay aligned with cidx
            LOG.warn("No valid chars in col %d (col=%s)?" % (cidx+1, col))
            entropy_per_col.append(-1)
            seqid_per_col.append(-1)
        else:
            # gaps count as a symbol of their own for the entropy
            vec = [counter[res]/float(denom)
                   for res in list(char_set) + ["-"]]
            LOG.debug("vec=%s denom=%s counter=%s" % (vec, denom, counter))
            entropy_per_col.append(shannon_entropy(vec))
            seqid_per_col.append(seqid(counter))

        # Values are stored for every column (see above), so skipping
        # the output is only allowed after they have been appended.
        if map_to_seq is not None and map_to_seq[cidx] in GAP_CHARS:
            LOG.debug("Skipping col %d because map_to_seq has gap there." % (cidx+1))
            continue

        counts_str = ' '.join(
            ["%s:%d" % (k, v) for (k, v) in sorted(counter.iteritems())])
        if map_to_seq is None:
            rep_col = cidx
        else:
            rep_col = map_to_seq_cols[cidx]
        print("%d %.6f %.6f %s" % (
            rep_col+1, seqid_per_col[cidx], entropy_per_col[cidx],
            counts_str))

Example #4

0

Show file

File: seqgrep.py Project: aersoares81/compbio-utils

def main():
    """
    The main function

    grep for sequence files: the first positional argument is a regular
    expression, all further ones are sequence files ("-" for stdin,
    names ending in ".gz" are opened with gzip). Every record whose
    sequence or id (depending on opts.search_in) matches the pattern is
    printed to stdout as a header line (">" plus name) followed by the
    sequence. With opts.invert_match the non-matching records are
    printed instead. If more than one file is given, each output line is
    prefixed with "<filename>:".
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) < 2:
        parser.error("Need pattern and at least one seqfile as argument")
        sys.exit(1)

    # first arg is pattern. rest are files
    pattern_arg = args[0]
    if opts.revcomp:
        pattern_arg = str(Seq(pattern_arg).reverse_complement())
        LOG.info("Pattern after reverse complement: %s" % pattern_arg)
    seqfiles_arg = args[1:]
    LOG.debug("args=%s" % (args))
    LOG.debug("pattern_arg=%s" % (pattern_arg))
    LOG.debug("seqfiles_arg=%s" % (seqfiles_arg))

    if opts.ignore_case:
        regexp = re.compile(pattern_arg, flags=re.IGNORECASE)
    else:
        regexp = re.compile(pattern_arg)

    # check all inputs up front so we fail before producing any output
    for fseq in seqfiles_arg:
        if fseq != "-" and not os.path.exists(fseq):
            LOG.fatal("input file %s does not exist.\n" % fseq)
            sys.exit(1)

    print_file_prefix = len(seqfiles_arg) > 1

    for fseq in seqfiles_arg:
        if fseq == "-":
            fhandle = sys.stdin
        elif fseq.endswith(".gz"):
            fhandle = gzip.open(fseq)
        else:
            fhandle = open(fseq, "rU")

        # close the handle even if parsing or matching raises; stdin is
        # left open
        try:
            fmt = bioutils.guess_seqformat(fseq)
            if not fmt:
                fmt = 'fasta'
            LOG.info("Checking file %s (format %s)" % (fseq, fmt))

            prefix = ""
            if print_file_prefix:
                prefix = fseq + ":"

            for record in SeqIO.parse(fhandle, fmt):
                # special case fasta: id is everything before the first
                # whitespace. description contains this as well.
                if fmt == 'fasta':
                    name = record.description
                else:
                    name = record.id

                if opts.search_in == 'seq':
                    target = record.seq
                elif opts.search_in == 'id':
                    target = name
                else:
                    raise ValueError(
                        "internal error...not sure where to search in")

                match = regexp.search(str(target))
                if match and not opts.invert_match:
                    LOG.debug("match.string=%s" % match.string)
                    print_match = True
                elif opts.invert_match and not match:
                    print_match = True
                else:
                    print_match = False

                if print_match:
                    print("%s>%s\n%s%s" % (prefix, name, prefix, record.seq))
        finally:
            if fhandle is not sys.stdin:
                fhandle.close()

Example #5

0

Show file

File: alnscore.py Project: aersoares81/compbio-utils

def main():
    """
    The main function

    Reads one alignment (from the given file, or as fasta from stdin if
    the input argument is "-") and prints one line per alignment column
    to stdout: column number (1-based), sequence identity, Shannon
    entropy and the sorted per-character counts of the column.

    Only characters of the hardcoded character set enter the entropy;
    anything else (gaps, ambiguities) is ignored for it. If opts.map_to
    names a sequence, columns where that sequence has a gap are not
    printed and the remaining columns are numbered via unaln_pos_map()
    of that sequence.

    Raises ValueError if a column holds no character of the character
    set at all.
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)
    if not opts.aln_in:
        parser.error("Missing input alignment argument\n")
        sys.exit(1)
    if len(args):
        parser.error("Unrecognized arguments found: %s" % args)

    # alternatives: "ACGTU" or, for protein, "ACDEFGHIKLMNPQRSTVWY"
    #x = any
    #z = Gln or Glu
    #b = Asp or Asn
    char_set = "ACGTN"

    LOG.warn("using hardcoded charset %s" % char_set)
    # FIXME auto-detection of alphabet

    if opts.aln_in != "-" and not os.path.exists(opts.aln_in):
        LOG.fatal("Input alignment %s does not exist.\n" % opts.aln_in)
        sys.exit(1)

    if opts.aln_in == "-":
        fh = sys.stdin
        fmt = 'fasta'
    else:
        fmt = bioutils.guess_seqformat(opts.aln_in)
        fh = open(opts.aln_in, "rU")

    # note: had one case where this happily read an unaligned file!?
    try:
        aln = AlignIO.read(fh, fmt)
    finally:
        # The alignment is fully in memory now, so the file can go.
        # Compare against stdin (not stdout): stdin must stay open.
        if fh is not sys.stdin:
            fh.close()

    # if requested, get the sequence record of the sequence we should
    # map positions to
    map_to_seq = None
    map_to_seq_cols = None
    if opts.map_to:
        candidates = [rec.seq for rec in aln if rec.id == opts.map_to]
        if not candidates:
            LOG.fatal("Couldn't find a sequence called %s in %s" % (
                opts.map_to, fh.name))
            sys.exit(1)
        elif len(candidates) > 1:
            LOG.fatal("Find more than one sequence with name %s in %s" % (
                opts.map_to, fh.name))
            sys.exit(1)
        map_to_seq = candidates[0]
        map_to_seq_cols = unaln_pos_map(map_to_seq)

    entropy_per_col = []
    seqid_per_col = []
    for i in xrange(aln.get_alignment_length()):
        #col = aln.get_column(i) # deprecated
        col = get_aln_column(aln, i).upper()

        not_in_char_set = set(
            c for c in col
            if c not in char_set and c not in bioutils.GAP_CHARS)
        if not_in_char_set:
            LOG.warn("Found characters not in char_set (%s) in col %d (%s)" % (
                char_set, i+1, not_in_char_set))
        counter = Counter(col)

        # this will ignore invalid chars incl. ambiguities
        denom = sum([counter[r] for r in char_set])
        if denom == 0:
            LOG.fatal("denom = 0, means no valid chars in col %d?" % (i+1))
            raise ValueError("no valid chars in col %d" % (i+1))
        vec = [counter[res]/float(denom) for res in char_set]
        LOG.debug("vec=%s denom=%s counter=%s" % (vec, denom, counter))
        entropy_per_col.append(shannon_entropy(vec))

        seqid_per_col.append(seqid(counter))

        # Values are stored for every column, so skipping the output is
        # only allowed after they have been appended; otherwise the
        # lists would no longer be indexable by column.
        if map_to_seq is not None and map_to_seq[i] in bioutils.GAP_CHARS:
            LOG.debug("Skipping col %d because map_to_seq has gap there." % (i+1))
            continue

        counts_str = ' '.join(
            ["%s:%d" % (k, v) for (k, v) in sorted(counter.iteritems())])
        if map_to_seq is None:
            rep_col = i
        else:
            rep_col = map_to_seq_cols[i]
        print("%d %.6f %.6f %s" % (
            rep_col+1, seqid_per_col[i], entropy_per_col[i], counts_str))

Example #6

0

Show file

File: seqstat.py Project: aersoares81/compbio-utils

def main():
    """
    The main function

    Reads all sequences from the file given as the single positional
    argument (stdin if it is "-") and prints summary statistics to
    stdout: guessed type of the first sequence, number of sequences,
    smallest/largest/average ungapped length and whether the input
    looks aligned. Input counts as aligned if there is more than one
    sequence and all have the same (gapped) length; in that case
    pairwise identity statistics are printed as well.

    With opts.info_for_all a per-sequence table follows (name, ungapped
    length and, if aligned, highest/lowest identity and the respective
    partner). The last line reports the number of unique names and
    unique sequences.
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 1:
        parser.error("Need sequence file as input argument")
        sys.exit(1)

    fseq = args[0]
    if fseq == "-":
        fhandle = sys.stdin
    else:
        fhandle = open(fseq, "rU")

    seqrecs = []
    seqlens = []
    seqlens_ungapped = []

    # read all into memory: makes id computation easier. the file
    # might come from stdin so we can't read twice.
    # try/finally so the file is closed on parse errors as well.
    try:
        fmt = opts.informat
        if not fmt:
            fmt = bioutils.guess_seqformat(fseq)
        for seqrec in SeqIO.parse(fhandle, fmt):
            seqrecs.append(seqrec)
            seqlens.append(len(seqrec.seq))
            seqlens_ungapped.append(len(bioutils.ungap(str(seqrec.seq))))
    finally:
        if fhandle is not sys.stdin:
            fhandle.close()

    nseqs = len(seqlens)
    if nseqs == 0:
        LOG.warn('No sequences found. Try changing the format (just tried: %s)' % fmt)
        sys.exit(0)

    aligned = False
    if nseqs > 1 and len(set(seqlens)) == 1:
        # add and len(set(seqlens_ungapped)) != 1 to make sure
        # unaligned sequence lengths are identical
        aligned = True
        aln_len = seqlens[0] # any will do as we know they're aligned
        pw_id_mx = comp_pairwise_ident_matrix(seqrecs)

    if not aligned and seqlens != seqlens_ungapped:
        LOG.warn("Found gaps, but sequences do not seem to be aligned."
                 " Stats will be for ungapped seqs.")

    # guess type from first entry. guess_if_nucleic_acid() is taken at
    # its name here: a true result means nucleic acid (the labels were
    # swapped before).
    if guess_if_nucleic_acid(seqrecs[0].seq):
        seqtype = 'nucleic'
    else:
        seqtype = 'protein'
    print("Type (of 1st seq):   %s" % (seqtype))

    print("Number of sequences: %d" % (nseqs))
    print("Smallest:            %d" % (min(seqlens_ungapped)))
    print("Largest:             %d" % (max(seqlens_ungapped)))
    print("Average length:      %.1f" % (
        sum(seqlens_ungapped)/float(len(seqlens_ungapped))))
    #print "Format:              %s" % (fmt)

    print("Aligned:             %s" % ("yes" if aligned else "no"))
    if aligned:
        # Ignore the self-comparison None's. Test for None explicitly:
        # a plain truth test would also drop pairs with an identity of
        # 0.0 and thereby distort mean and minimum.
        flat_pw_id_mx = [x for x in chain.from_iterable(pw_id_mx)
                         if x is not None]
        print("Alignment length:    %d" % (aln_len))
        (mean, std) = meanstd(flat_pw_id_mx)
        print("Average identity:    %0.2f" % (mean))
        print("Standard deviation:  %0.2f" % (std))
        print("Most related pair:   %0.2f" % (max(flat_pw_id_mx)))
        print("Most unrelated pair: %0.2f" % (min(flat_pw_id_mx)))

    if opts.info_for_all:
        # spacer
        print("")

        header = "# Name\tLength"
        if aligned:
            header += "\thigh-id to\tlow-id to"
        print(header)

        for (i, seqrec) in enumerate(seqrecs):
            line = "%s\t%d" % (seqrec.id, seqlens_ungapped[i])

            if aligned:
                # Construct the full list of pairwise ids of sequence i
                # from the fake matrix: its own row plus entry i of
                # every later row. Work on a copy so the matrix itself
                # is left untouched.
                pw_ids = list(pw_id_mx[i])
                pw_ids.extend([pw_id_mx[j][i]
                               for j in xrange(i+1, nseqs)])
                assert len(pw_ids) == nseqs, (
                    "len(pw_ids)=%d, but expected %d" % (len(pw_ids), nseqs))

                # Find min and max and corresponding partner index.
                # The self-comparison slot is temporarily set to a
                # value that can never win the respective search.
                pw_ids[i] = -1.0
                (pw_id_max_idx, pw_id_max_val) = argminmax(pw_ids, 'max')
                pw_ids[i] = 1.1
                (pw_id_min_idx, pw_id_min_val) = argminmax(pw_ids, 'min')

                line += "\t%.4f\t%s\t%.4f\t%s" % (
                    pw_id_max_val, seqrecs[pw_id_max_idx].id,
                    pw_id_min_val, seqrecs[pw_id_min_idx].id)
            print(line)

    print("%d names are unique and %d sequences are unique (including gaps)." % (
        len(set([s.id for s in seqrecs])),
        len(set([str(s.seq) for s in seqrecs]))))

Example #7

0

Show file

def main():
    """
    The main function

    Prints summary statistics for the sequence file given as the single
    positional argument ("-" reads from stdin): guessed type of the
    first sequence, number of sequences, smallest/largest/average
    ungapped length and whether the sequences look aligned (more than
    one sequence, all of the same gapped length). For aligned input the
    alignment length and pairwise identity statistics are added.

    With opts.info_for_all one line per sequence follows (name, ungapped
    length and, if aligned, highest/lowest identity with partner name).
    The final line reports how many names and sequences are unique.
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 1:
        parser.error("Need sequence file as input argument")
        sys.exit(1)

    fseq = args[0]
    if fseq == "-":
        fhandle = sys.stdin
    else:
        fhandle = open(fseq, "rU")

    seqrecs = []
    seqlens = []
    seqlens_ungapped = []

    # read all into memory: makes id computation easier. the file
    # might come from stdin so we can't read twice
    try:
        fmt = opts.informat
        if not fmt:
            fmt = bioutils.guess_seqformat(fseq)
        for seqrec in SeqIO.parse(fhandle, fmt):
            seqrecs.append(seqrec)
            seqlens.append(len(seqrec.seq))
            seqlens_ungapped.append(len(bioutils.ungap(str(seqrec.seq))))
    finally:
        # also reached on parse errors, so the file never leaks
        if fhandle is not sys.stdin:
            fhandle.close()

    nseqs = len(seqlens)
    if nseqs == 0:
        LOG.warn(
            'No sequences found. Try changing the format (just tried: %s)' %
            fmt)
        sys.exit(0)

    aligned = False
    if nseqs > 1 and len(set(seqlens)) == 1:
        # add and len(set(seqlens_ungapped)) != 1 to make sure
        # unaligned sequence lengths are identical
        aligned = True
        aln_len = seqlens[0]  # any will do as we know they're aligned
        pw_id_mx = comp_pairwise_ident_matrix(seqrecs)

    if not aligned and seqlens != seqlens_ungapped:
        LOG.warn("Found gaps, but sequences do not seem to be aligned."
                 " Stats will be for ungapped seqs.")

    # Guess type from first entry. Going by its name, a true result of
    # guess_if_nucleic_acid() means nucleic acid; the two labels were
    # the wrong way round.
    if guess_if_nucleic_acid(seqrecs[0].seq):
        seqtype = 'nucleic'
    else:
        seqtype = 'protein'
    print("Type (of 1st seq):   %s" % (seqtype))

    print("Number of sequences: %d" % (nseqs))
    print("Smallest:            %d" % (min(seqlens_ungapped)))
    print("Largest:             %d" % (max(seqlens_ungapped)))
    print("Average length:      %.1f" % (sum(seqlens_ungapped) /
                                         float(len(seqlens_ungapped))))
    #print "Format:              %s" % (fmt)

    print("Aligned:             %s" % ("yes" if aligned else "no"))
    if aligned:
        # Drop only the self-comparison None's; "if x" would also throw
        # away genuine identities of 0.0.
        flat_pw_id_mx = [x for x in chain.from_iterable(pw_id_mx)
                         if x is not None]
        print("Alignment length:    %d" % (aln_len))
        (mean, std) = meanstd(flat_pw_id_mx)
        print("Average identity:    %0.2f" % (mean))
        print("Standard deviation:  %0.2f" % (std))
        print("Most related pair:   %0.2f" % (max(flat_pw_id_mx)))
        print("Most unrelated pair: %0.2f" % (min(flat_pw_id_mx)))

    if opts.info_for_all:
        # spacer
        print("")

        header = "# Name\tLength"
        if aligned:
            header += "\thigh-id to\tlow-id to"
        print(header)

        for (i, seqrec) in enumerate(seqrecs):
            line = "%s\t%d" % (seqrec.id, seqlens_ungapped[i])

            if aligned:
                # All pairwise ids of sequence i: its own row of the
                # fake matrix followed by entry i of each later row.
                # Copy the row first so the matrix is not modified.
                pw_ids = list(pw_id_mx[i])
                pw_ids.extend([pw_id_mx[j][i] for j in xrange(i + 1, nseqs)])
                assert len(pw_ids) == nseqs, (
                    "len(pw_ids)=%d, but expected %d" % (len(pw_ids), nseqs))

                # Mask the self-comparison slot with a value that cannot
                # be picked, then look up the extremes and their index.
                pw_ids[i] = -1.0
                (pw_id_max_idx, pw_id_max_val) = argminmax(pw_ids, 'max')
                pw_ids[i] = 1.1
                (pw_id_min_idx, pw_id_min_val) = argminmax(pw_ids, 'min')

                line += "\t%.4f\t%s\t%.4f\t%s" % (
                    pw_id_max_val, seqrecs[pw_id_max_idx].id, pw_id_min_val,
                    seqrecs[pw_id_min_idx].id)
            print(line)

    print("%d names are unique and %d sequences are unique (including gaps)." % (
        len(set([s.id for s in seqrecs])),
        len(set([str(s.seq) for s in seqrecs]))))

Example #8

0

Show file

File: gb_annotation_transfer.py Project: aersoares81/compbio-utils

def main():
    """
    The main function

    Transfers feature positions from a reference Genbank record onto a
    query sequence by means of a pairwise alignment of the two.

    Reads exactly one record from opts.ref_gb and exactly two sequences
    from opts.pw_aln. The alignment sequence whose id is the closest
    match to the Genbank id is taken as reference, the other one as
    query. For every feature of the Genbank record one tab-separated
    line is printed to stdout: query positions, reference positions,
    strand, type and qualifiers (without 'translation').

    Invalid input is logged as fatal and ends the program with exit
    status 1.
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 0:
        parser.error("Unrecognized args found")
        sys.exit(1)

    for (f, d) in [(opts.pw_aln, "Pairwise alignment"),
                   (opts.ref_gb, "Reference Genbank")]:
        if not f:
            parser.error("Missing %s argument" % d)
        if not os.path.exists(f):
            LOG.fatal("%s file '%s' does not exist" % (d, f))
            sys.exit(1)

    # The input checks below are explicit rather than asserts, since
    # asserts are stripped when running under -O.
    refseq = list(SeqIO.parse(opts.ref_gb, "genbank"))
    if len(refseq) != 1:
        LOG.fatal("Was expecting one Genbank record, but parsed %d from %s" % (
            len(refseq), opts.ref_gb))
        sys.exit(1)
    refseq = refseq[0]

    pw_aln = list(SeqIO.parse(opts.pw_aln, bioutils.guess_seqformat(opts.pw_aln)))
    if len(pw_aln) != 2:
        LOG.fatal("Was expecting two sequences, but parsed %d from %s" % (
            len(pw_aln), opts.pw_aln))
        sys.exit(1)
    aln_ids = [s.id for s in pw_aln]

    # determine ref id
    #
    # seqids in alignment should match genbank id but might not
    matches = difflib.get_close_matches(refseq.id, aln_ids)
    if not matches:
        LOG.fatal(
            "Couldn't find a sensible match between sequence ids in alignment and genbank")
        sys.exit(1)
    aln_ref_id = matches[0]
    if aln_ref_id != refseq.id:
        LOG.warn("Assuming %s (from alignment) is the same as %s (from Genbank)" % (
            aln_ref_id, refseq.id))
    LOG.info("%s is the ref id" % (aln_ref_id))

    # determine query id: the one sequence that is not the reference.
    # If both sequences carry the same id there is no way to tell them
    # apart (and query_id would otherwise stay undefined).
    other_ids = [sid for sid in aln_ids if sid != aln_ref_id]
    if not other_ids:
        LOG.fatal("Both sequences in %s are called %s; can't tell query from reference" % (
            opts.pw_aln, aln_ref_id))
        sys.exit(1)
    query_id = other_ids[-1]
    LOG.info("%s is the query id" % (query_id))

    pos_map = PosMap(pw_aln)
    #pos_map.output()
    ref_to_query_map = pos_map.convert(aln_ref_id, query_id)

    print("#QUERY-POS (%s)\tREF-POS (%s)\tSTRAND\tTYPE\tQUALIFIERS" % (
        query_id, aln_ref_id))
    for feat in refseq.features:
        # start is incremented by one for the printed/looked-up
        # position, end is used as is
        ref_start = feat.location.start.position+1
        ref_end = feat.location.end.position
        query_pos_str = "%d-%d" % (
            ref_to_query_map[ref_start], ref_to_query_map[ref_end])
        orig_pos_str = "%d-%d" % (ref_start, ref_end)
        strand_str = "%s" % feat.strand
        type_str = "%s" % feat.type
        qualifiers_str = '; '.join("%s %s" % (k, ', '.join(v))
                                    for (k, v) in feat.qualifiers.iteritems()
                                    if k != 'translation')
        print('\t'.join([query_pos_str, orig_pos_str,
                         strand_str, type_str, qualifiers_str]))
    LOG.info("Note: feature positions might overlap")

Example #9

0

Show file

File: gb_annotation_transfer.py Project: anyone1985/compbio-utils

def main():
    """
    The main function

    Maps the features of a reference Genbank record (opts.ref_gb) onto
    a query sequence using a pairwise alignment of reference and query
    (opts.pw_aln).

    Exactly one Genbank record and exactly two aligned sequences are
    required. The aligned sequence whose id matches the Genbank id most
    closely is used as reference, the remaining one as query. Output is
    one tab-separated line per feature on stdout: query positions,
    reference positions, strand, type and qualifiers (the 'translation'
    qualifier is left out).

    Invalid input is logged as fatal and ends the program with exit
    status 1.
    """

    parser = cmdline_parser()
    (opts, args) = parser.parse_args()

    if opts.verbose:
        LOG.setLevel(logging.INFO)
    if opts.debug:
        LOG.setLevel(logging.DEBUG)

    if len(args) != 0:
        parser.error("Unrecognized args found")
        sys.exit(1)

    for (f, d) in [(opts.pw_aln, "Pairwise alignment"),
                   (opts.ref_gb, "Reference Genbank")]:
        if not f:
            parser.error("Missing %s argument" % d)
        if not os.path.exists(f):
            LOG.fatal("%s file '%s' does not exist" % (d, f))
            sys.exit(1)

    # Input is validated with explicit checks instead of asserts:
    # asserts disappear under "python -O".
    refseq = list(SeqIO.parse(opts.ref_gb, "genbank"))
    if len(refseq) != 1:
        LOG.fatal(
            "Was expecting one Genbank record, but parsed %d from %s" %
            (len(refseq), opts.ref_gb))
        sys.exit(1)
    refseq = refseq[0]

    pw_aln = list(
        SeqIO.parse(opts.pw_aln, bioutils.guess_seqformat(opts.pw_aln)))
    if len(pw_aln) != 2:
        LOG.fatal(
            "Was expecting two sequences, but parsed %d from %s" %
            (len(pw_aln), opts.pw_aln))
        sys.exit(1)
    aln_ids = [s.id for s in pw_aln]

    # determine ref id
    #
    # seqids in alignment should match genbank id but might not
    matches = difflib.get_close_matches(refseq.id, aln_ids)
    if not matches:
        LOG.fatal(
            "Couldn't find a sensible match between sequence ids in alignment and genbank"
        )
        sys.exit(1)
    aln_ref_id = matches[0]
    if aln_ref_id != refseq.id:
        LOG.warn(
            "Assuming %s (from alignment) is the same as %s (from Genbank)" %
            (aln_ref_id, refseq.id))
    LOG.info("%s is the ref id" % (aln_ref_id))

    # Determine query id: whichever sequence is not the reference. With
    # two identically named sequences none is left over, which has to be
    # caught here (query_id would be undefined otherwise).
    other_ids = [sid for sid in aln_ids if sid != aln_ref_id]
    if not other_ids:
        LOG.fatal(
            "Both sequences in %s are called %s; can't tell query from reference" %
            (opts.pw_aln, aln_ref_id))
        sys.exit(1)
    query_id = other_ids[-1]
    LOG.info("%s is the query id" % (query_id))

    pos_map = PosMap(pw_aln)
    #pos_map.output()
    ref_to_query_map = pos_map.convert(aln_ref_id, query_id)

    print("#QUERY-POS (%s)\tREF-POS (%s)\tSTRAND\tTYPE\tQUALIFIERS" % (
        query_id, aln_ref_id))
    for feat in refseq.features:
        # start is shifted by one before lookup/printing, end is not
        ref_start = feat.location.start.position + 1
        ref_end = feat.location.end.position
        query_pos_str = "%d-%d" % (ref_to_query_map[ref_start],
                                   ref_to_query_map[ref_end])
        orig_pos_str = "%d-%d" % (ref_start, ref_end)
        strand_str = "%s" % feat.strand
        type_str = "%s" % feat.type
        qualifiers_str = '; '.join("%s %s" % (k, ', '.join(v))
                                   for (k, v) in feat.qualifiers.iteritems()
                                   if k != 'translation')
        print('\t'.join([
            query_pos_str, orig_pos_str, strand_str, type_str, qualifiers_str
        ]))
    LOG.info("Note: feature positions might overlap")