def main(gff, logger=None, noncanonical_gene=False):
    """Run the intra-model checks on every root feature of ``gff``.

    A root feature is a line whose ``line_type`` is ``'feature'`` and whose
    ``attributes`` carry no ``'Parent'`` key.

    Args:
        gff: Parsed GFF3 object exposing ``lines`` (an iterable of line
            dicts); it is first passed through
            ``function4gff.FIX_MISSING_ATTR``.
        logger: Logger used for warnings about lines that cannot be
            inspected.  When ``None``, a module-level logger is used for
            those warnings instead of failing.
        noncanonical_gene: When true, only the pseudo-child-type and
            redundant-length checks run; the incomplete, internal-stop,
            distinct-isoform and merged-gene-parent checks are skipped.

    Returns:
        The accumulated list of error entries produced by the checks, or
        ``None`` when no error was found.
    """
    function4gff.FIX_MISSING_ATTR(gff, logger=logger)

    if logger is None:
        # The warning below used to raise AttributeError when the default
        # logger=None was in effect; fall back to a module logger instead.
        import logging
        warn_logger = logging.getLogger(__name__)
    else:
        warn_logger = logger

    roots = []
    for line in gff.lines:
        try:
            # ``in`` works on Python 2 and 3; ``dict.has_key`` no longer
            # exists on Python 3 and made every line land in the handler.
            if (line['line_type'] == 'feature'
                    and 'Parent' not in line['attributes']):
                roots.append(line)
        except Exception:
            # Best-effort: report the unreadable line and keep scanning.
            warn_logger.warning(
                '[Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'
                .format(str(line['line_index'] + 1), line['line_raw']))

    # Checks are applied to each root in this exact order.
    checks = [check_pseudo_child_type, check_redundant_length]
    if not noncanonical_gene:
        checks.extend([
            check_incomplete,
            check_internal_stop,
            check_distinct_isoform,
            check_merged_gene_parent,
        ])

    error_set = []
    for root in roots:
        for check in checks:
            result = check(gff, root)
            if result is not None:
                error_set.extend(result)

    if error_set:
        return error_set
    return None
def main(gff, logger=None):
    """Run the single-feature checks on every feature line of ``gff``.

    The GFF3 object is first normalised with
    ``function4gff.FIX_MISSING_ATTR`` and ``FIX_PSEUDOGENE``; afterwards
    each line whose ``line_type`` is ``'feature'`` goes through
    ``check_pseudogene`` and then ``check_strand``.

    Args:
        gff: Parsed GFF3 object exposing ``lines`` (an iterable of line
            dicts).
        logger: Logger forwarded to ``function4gff.FIX_MISSING_ATTR``.

    Returns:
        The accumulated list of error entries, or ``None`` when no check
        reported anything.
    """
    function4gff.FIX_MISSING_ATTR(gff, logger=logger)
    FIX_PSEUDOGENE(gff)

    # Collect the feature lines up front, before any check runs.
    feature_lines = [
        entry for entry in gff.lines if entry['line_type'] == 'feature'
    ]

    found = []
    for entry in feature_lines:
        # Pseudogene check first, strand check second, for each feature.
        for check in (check_pseudogene, check_strand):
            outcome = check(gff, entry)
            if outcome is not None:
                found.extend(outcome)

    if not found:
        return None
    return found
def main(gff, gff_file, fasta_file, logger=None, noncanonical_gene=False):
    """Run the inter-model checks across the children of all root features.

    A root feature is a line whose ``line_type`` is ``'feature'`` and whose
    ``attributes`` carry no ``'Parent'`` key.  The ``children`` of every
    root are gathered into one list that is handed to ``check_duplicate``.

    Args:
        gff: Parsed GFF3 object exposing ``lines`` (an iterable of line
            dicts); it is first passed through
            ``function4gff.FIX_MISSING_ATTR``.
        gff_file: Forwarded unchanged to ``check_incorrectly_split_genes``.
        fasta_file: Forwarded unchanged to
            ``check_incorrectly_split_genes``.
        logger: Logger used for warnings and forwarded to the helpers.
            When ``None``, a module-level logger is used for the warnings
            emitted here; the helpers still receive ``None``.
        noncanonical_gene: When true, both inter-model checks are skipped.

    Returns:
        The accumulated list of error entries, or ``None`` when no error
        was found (which is always the case when ``noncanonical_gene`` is
        true).
    """
    function4gff.FIX_MISSING_ATTR(gff, logger=logger)

    if logger is None:
        # The warning below used to raise AttributeError when the default
        # logger=None was in effect; fall back to a module logger instead.
        import logging
        warn_logger = logging.getLogger(__name__)
    else:
        warn_logger = logger

    roots = []
    for line in gff.lines:
        try:
            if (line['line_type'] == 'feature'
                    and 'Parent' not in line['attributes']):
                roots.append(line)
        except Exception:
            # Best-effort: report the unreadable line and keep scanning.
            # ``Exception`` (not a bare except) so Ctrl-C / SystemExit
            # still propagate.
            warn_logger.warning(
                '[Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'
                .format(str(line['line_index'] + 1), line['line_raw']))

    # Flatten the direct children of every root, preserving root order.
    trans_list = []
    for root in roots:
        trans_list.extend(root['children'])

    error_set = []
    if not noncanonical_gene:
        result = check_duplicate(gff, trans_list)
        if result is not None:
            error_set.extend(result)

        result = check_incorrectly_split_genes(gff, gff_file, fasta_file,
                                               logger)
        if result is not None:
            error_set.extend(result)

    if error_set:
        return error_set
    return None
def script_main():
    """Command-line entry point: QC a GFF3 file and write two reports.

    Parses the command line, loads the GFF3 (and genome fasta) into a
    ``Gff3`` object, runs its built-in checks plus the intra-model,
    inter-model and single-feature checks, and then writes:

    * an error report (``-o``/``--output``, default ``report.txt``) with
      one ``Line_num / Error_code / Error_tag`` row per error entry;
    * a statistic report (``-s``/``--statistic``, default
      ``statistic.txt``) with the number of entries per error code.

    Exits with status 1 after printing the help text when the GFF3 or the
    fasta input is neither given on the command line nor available on a
    non-tty STDIN.
    """
    logger_stderr = logging.getLogger(__name__ + 'stderr')
    logger_stderr.setLevel(logging.INFO)
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(
        logging.Formatter('%(levelname)-8s %(message)s'))
    logger_stderr.addHandler(stderr_handler)
    # Silent logger handed to Gff3 so its own messages are discarded.
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)

    import argparse
    from textwrap import dedent
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=dedent("""\
        Testing environment:
        1. Python 2.7

        Inputs:
        1. GFF3: Specify the file name with the -g or --gff argument; Please note that this program requires gene/pseudogene and mRNA/pseudogenic_transcript to have an ID attribute in column 9.
        2. fasta file: Specify the file name with the -f or --fasta argument

        Outputs:
        1. Error report for the input GFF3 file
            * Line_num: Line numbers of the found problematic models in the input GFF3 file.
            * Error_code: Error codes for the found problematic models. Please refer to lib/ERROR/ERROR.py to see the full list of Error_code and the corresponding Error_tag.
            * Error_tag: Detail of the found errors for the problematic models. Please refer to lib/ERROR/ERROR.py to see the full list of Error_code and the corresponding Error_tag.

        Quick start:
        gff3_QC -g example_file/example.gff3 -f example_file/reference.fa -o test
        or
        gff3_QC --gff example_file/example.gff3 --fasta example_file/reference.fa --output test
        """))
    parser.add_argument('-g', '--gff', type=str,
                        help='Genome annotation file, gff3 format')
    parser.add_argument('-f', '--fasta', type=str,
                        help='Genome sequences, fasta format')
    parser.add_argument(
        '-noncg', '--noncanonical_gene', action="store_true",
        help='gff3 file is not formatted in the canonical gene model format.')
    parser.add_argument(
        '-i', '--initial_phase', action="store_true",
        help='Check whether initial CDS phase is 0 (default: no check)')
    parser.add_argument(
        '-n', '--allowed_num_of_n', type=int, default=0,
        help=
        'Max number of Ns allowed in a feature, anything more will be reported as an error (default: 0)'
    )
    parser.add_argument(
        '-t', '--check_n_feature_types', nargs='*', default=['CDS'],
        help=
        'Count the number of Ns in each feature with the type specified, multiple types may be specified, ex: -t CDS exon (default: "CDS")'
    )
    parser.add_argument('-o', '--output', type=str,
                        help='output file name (default: report.txt)')
    parser.add_argument('-s', '--statistic', type=str,
                        help='statistic file name (default: statistic.txt)')
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args()

    # NOTE(review): when neither -g nor -f is given and STDIN is piped,
    # both inputs end up bound to the same sys.stdin object -- confirm
    # whether that combination is meant to be supported.
    if args.gff:
        logger_stderr.info('Checking gff file (%s)...', args.gff)
    elif not sys.stdin.isatty():  # if STDIN connected to pipe or file
        args.gff = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input
        parser.print_help()
        sys.exit(1)

    if args.fasta:
        logger_stderr.info('Checking genome fasta (%s)...', args.fasta)
    elif not sys.stdin.isatty():  # if STDIN connected to pipe or file
        args.fasta = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input
        parser.print_help()
        sys.exit(1)

    # With the default feature types (['CDS']) this is true unless the
    # user passes an empty -t list together with -n 0.
    check_n = bool(args.allowed_num_of_n or args.check_n_feature_types)

    logger_stderr.info('Reading gff files: (%s)...\n', args.gff)
    gff3 = Gff3(gff_file=args.gff, fasta_external=args.fasta,
                logger=logger_null)

    logger_stderr.info('Checking errors in the gff files: (%s)...\n',
                       args.gff)
    if not gff3.check_parent_boundary():
        # NOTE(review): exits with status 0 and without writing any
        # report -- confirm this is the intended outcome.
        sys.exit()
    gff3.check_unresolved_parents()
    if not args.noncanonical_gene:
        gff3.check_phase(args.initial_phase)
    gff3.check_reference(fasta_external=args.fasta, check_n=check_n,
                         allowed_num_of_n=args.allowed_num_of_n,
                         feature_types=args.check_n_feature_types)

    logger_stderr.info('\t- Checking missing attributes: (%s)...\n',
                       'function4gff.FIX_MISSING_ATTR()')
    function4gff.FIX_MISSING_ATTR(gff3, logger=logger_stderr)

    error_set = []

    found = function4gff.extract_internal_detected_errors(gff3)
    if found:
        error_set.extend(found)

    logger_stderr.info('\t- Checking intra-model errors: (%s)...\n',
                       args.gff)
    found = intra_model.main(gff3, logger=logger_stderr,
                             noncanonical_gene=args.noncanonical_gene)
    if found:
        error_set.extend(found)

    logger_stderr.info('\t- Checking inter-model errors: (%s)...\n',
                       args.gff)
    found = inter_model.main(gff3, args.gff, args.fasta,
                             logger=logger_stderr,
                             noncanonical_gene=args.noncanonical_gene)
    if found:
        error_set.extend(found)

    logger_stderr.info('\t- Checking single-feature errors: (%s)...\n',
                       args.gff)
    found = single_feature.main(gff3, logger=logger_stderr)
    if found:
        error_set.extend(found)

    report_path = args.output if args.output else 'report.txt'
    statistic_path = args.statistic if args.statistic else 'statistic.txt'
    logger_stderr.info('Print QC report at {0:s}'.format(report_path))
    logger_stderr.info(
        'Print QC statistic report at {0:s}'.format(statistic_path))

    # Both outputs iterate the entries in the same order, so sort once.
    # The sort key is each entry's sorted key names, as in the original.
    ordered_errors = sorted(error_set, key=lambda x: sorted(x.keys()))

    # ``with`` guarantees both files are flushed and closed, even when a
    # malformed error entry raises while the reports are being written.
    with open(report_path, 'w') as report_fh, \
            open(statistic_path, 'w') as statistic_fh:
        report_fh.write('Line_num\tError_code\tError_tag\n')
        for e in ordered_errors:
            tag = '[{0:s}]'.format(e['eTag'])
            report_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(
                str(e['line_num']), str(e['eCode']), str(tag)))

        # statistic_file
        error_counts = dict()
        ERROR_INFO = ERROR.INFO
        statistic_fh.write(
            'Error_code\tNumber_of_problematic_models\tError_tag\n')
        for s in ordered_errors:
            if s['eCode'] not in error_counts:
                error_counts[s['eCode']] = {
                    'count': 0,
                    'etag': ERROR_INFO[s['eCode']]
                }
            error_counts[s['eCode']]['count'] += 1
        for a in error_counts:
            statistic_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(
                str(a), str(error_counts[a]['count']),
                str(error_counts[a]['etag'])))