Example #1
0
    # idlist: an empty list if none is provided
    idlist = get_id_list(args)

    # read the sequences and store all that match the IDs
    # duplicates in sequence files will be stored twice
    n_match = {}  # per file number of sequences in list
    n_notmatch = {}  # per file number of sequences not in list
    n_sequence = {}  # per file number of sequences
    n_found = {}  # per ID, number of times found in all files
    n_file = 0
    n_total = 0
    n_written = 0
    out = sys.stdout
    for fastafile in glob.glob(args.input_filename):
        fasta = Fasta()
        fasta.open(fastafile)
        if args.outsuffix:
            outfile = os.path.basename(fastafile) + f'{args.outsuffix}'
            out = opensafe(outfile, 'w')
            if not out:
                # if file can't be opened use stdout
                out = sys.stdout

        n_sequence[fastafile] = 0
        n_match[fastafile] = 0
        n_notmatch[fastafile] = 0
        n_file += 1

        while fasta.next():
            n_sequence[fastafile] += 1
            n_total += 1
Example #2
0
# --------------------------------------------------------------------------------------------------
# main
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Script entry point (gtf2fasta): opens the GTF file named by argv[1],
    # loads every sequence of the Fasta file named by argv[2] into memory and
    # reports what was read on stderr.

    # open files
    gtffile = sys.argv[1]
    try:
        # gtf is deliberately left open; it is not consumed in this section
        gtf = open(gtffile, 'r')
    except OSError:
        # Catch only I/O failures: a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        sys.stderr.write('Unable to open GTF file ({})\n'.format(gtffile))
        # sys.exit is always available; the exit() builtin is only injected
        # by the site module for interactive use.
        sys.exit(1)

    # read all sequences, keyed by sequence ID
    seq = {}
    fasta = Fasta()
    fastafile = sys.argv[2]
    fasta.open(fastafile)
    sys.stderr.write('Reading Fasta {}...\n'.format(fastafile))
    nseq = 0
    while fasta.next():
        # a repeated ID overwrites the earlier sequence in seq, while nseq
        # still counts every record read
        seq[fasta.id] = fasta.seq
        nseq += 1

    # summary of the sequences that were loaded
    sys.stderr.write('\n{} Sequences read from {}\n'.format(nseq, fastafile))
    for s in seq:
        sys.stderr.write('\t{} len={}\n'.format(s, len(seq[s])))

    # run banner
    sys.stderr.write('\ngtf2fasta\n')
    sys.stderr.write('\tGTF: {}\n'.format(gtffile))
    sys.stderr.write('\tFasta: {}\n'.format(fastafile))

    # feature names of interest (presumably matched against the GTF feature
    # column further down - confirm against the code that follows)
    features = ('gene', 'pseudogene', 'tRNA')