# idlist will be an empty list if none is provided
idlist = get_id_list(args)

# Read the sequences and store all that match the IDs.
# Duplicates in sequence files will be stored twice.
n_match = {}      # per file: number of sequences in the ID list
n_notmatch = {}   # per file: number of sequences not in the ID list
n_sequence = {}   # per file: total number of sequences
n_found = {}      # per ID: number of times found across all files
n_file = n_total = n_written = 0

# Output goes to stdout unless a per-file output suffix was requested.
out = sys.stdout
for fastafile in glob.glob(args.input_filename):
    fasta = Fasta()
    fasta.open(fastafile)

    if args.outsuffix:
        # The output file is named after the input file's basename plus the suffix.
        outfile = f'{os.path.basename(fastafile)}{args.outsuffix}'
        # If the file can't be opened (falsy result), fall back to stdout.
        out = opensafe(outfile, 'w') or sys.stdout

    # Start the per-file counters for this input file.
    n_file += 1
    n_sequence[fastafile] = n_match[fastafile] = n_notmatch[fastafile] = 0

    while fasta.next():
        n_total += 1
        n_sequence[fastafile] += 1
# --------------------------------------------------------------------------------------------------
# main
# --------------------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Command line: argv[1] is the GTF file, argv[2] is the Fasta sequence file.
    gtffile = sys.argv[1]

    # Open the GTF file. Only I/O failures are reported as "unable to open";
    # a bare except would also trap KeyboardInterrupt/SystemExit and mislabel them.
    # NOTE(review): the handle is left open here, presumably for use by code
    # further down the script -- confirm it is closed there.
    try:
        gtf = open(gtffile, 'r')
    except OSError:
        sys.stderr.write('Unable to open GTF file ({})\n'.format(gtffile))
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to exist in every runtime.
        sys.exit(1)

    # Load every sequence from the Fasta file into memory, keyed by sequence ID.
    # A repeated ID overwrites the earlier entry but is still counted in nseq.
    seq = {}
    fasta = Fasta()
    fasta.open(sys.argv[2])
    sys.stderr.write('Reading Fasta {}...\n'.format(sys.argv[2]))
    nseq = 0
    while fasta.next():
        seq[fasta.id] = fasta.seq
        nseq += 1

    # Report what was read: total count, then one line per stored sequence.
    sys.stderr.write('\n{} Sequences read from {}\n'.format(nseq, sys.argv[2]))
    for s in seq:
        sys.stderr.write('\t{} len={}\n'.format(s, len(seq[s])))

    # Echo the run parameters to stderr.
    sys.stderr.write('\ngtf2fasta\n')
    sys.stderr.write('\tGTF: {}\n'.format(gtffile))
    sys.stderr.write('\tFasta: {}\n'.format(sys.argv[2]))

    # Feature names used by the rest of the script (not visible in this chunk).
    features = ('gene', 'pseudogene', 'tRNA')