Esempio n. 1
0
def _write_fasta_records(fname, records):
    """Write every non-empty defline/sequence pair of *records* to *fname*.

    The file is opened in a ``with`` block so the handle is flushed and
    closed even when a write fails. The file is created (possibly empty)
    regardless of how many records are written.
    """
    with open(fname, 'w') as fasta_fh:
        for defline, sequence in records.items():
            if len(defline) != 0 and len(sequence) != 0:
                fasta_fh.write('{0:s}\n{1:s}\n'.format(defline, sequence))


def _cds_to_peptides(cds_records):
    """Translate spliced CDS records and relabel their deflines as peptides."""
    peptides = dict()
    for defline, sequence in cds_records.items():
        defline = defline.replace("|mRNA(CDS)|", "|peptide|")
        peptides[defline] = translator(sequence)
    return peptides


def _report_all_type(records, output_prefix, tmp_stype, logger):
    """Write one per-type file of an ``all`` run; empty results write nothing."""
    if not len(records):
        return
    logger.info(
        '\t\tPrint out extracted sequences: {0:s}_{1:s}.fa...'.format(
            output_prefix, tmp_stype))
    _write_fasta_records('{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype),
                         records)


def main(gff_file=None,
         fasta_file=None,
         embedded_fasta=False,
         stype=None,
         user_defined=None,
         dline=None,
         qc=True,
         output_prefix=None,
         logger=None):
    """Extract sequences described by a GFF3 file and write them as FASTA.

    Args:
        gff_file: GFF3 input, handed to ``Gff3(gff_file=...)``. Required.
        fasta_file: Reference FASTA, handed to ``Gff3(fasta_external=...)``.
            Required unless ``embedded_fasta`` is true.
        embedded_fasta: When true, the GFF3 must carry embedded FASTA
            (``gff.fasta_embedded`` non-empty); the flag is forwarded to the
            extraction helpers.
        stype: Sequence type to extract; one of ``gene``, ``exon``,
            ``pre_trans``, ``trans``, ``cds``, ``pep``, ``all`` or
            ``user_defined``.
        user_defined: Two-item sequence ``[parent type, child type]``;
            required with ``stype='user_defined'``, ignored (with a warning)
            otherwise.
        dline: Forwarded unchanged to ``extract_start_end`` / ``splicer``
            (presumably the defline format -- confirm against those helpers).
        qc: When true, run the GFF3 checks first and print the features whose
            formatting errors may affect the extracted sequences.
        output_prefix: Prefix of the output file(s); each file is named
            ``<output_prefix>_<type>.fa``. Required.
        logger: ``logging.Logger`` for progress and error messages. When
            ``None`` a logger writing to stderr at INFO level is used.

    Raises:
        SystemExit: with exit status 1 on any invalid argument combination or
            when embedded FASTA was requested but the GFF3 has none.
    """
    if logger is None:
        # Fall back to a stderr logger; without it every logger.* call below
        # would fail with AttributeError on None.
        logger = logging.getLogger(__name__ + 'stderr')
        logger.setLevel(logging.INFO)
        if not logger.handlers:  # do not stack handlers on repeated calls
            stderr_handler = logging.StreamHandler()
            stderr_handler.setFormatter(
                logging.Formatter('%(levelname)-8s %(message)s'))
            logger.addHandler(stderr_handler)
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)

    if not gff_file or (not fasta_file and not embedded_fasta) or not stype:
        print(
            'Gff file, fasta file, and type of extracted sequences need to be specified'
        )
        sys.exit(1)
    type_set = [
        'gene', 'exon', 'pre_trans', 'trans', 'cds', 'pep', 'all',
        'user_defined'
    ]
    if stype not in type_set:
        logger.error(
            'Your sequence type is "{0:s}". Sequence type must be one of {1:s}!'
            .format(stype, str(type_set)))
        sys.exit(1)

    if not output_prefix:
        print('[Error] Please specify the prefix of output file name...')
        sys.exit(1)
    if stype != 'all':
        logger.info('Specifying prefix of output file name: (%s)...',
                    output_prefix)

    # Validate -u before any output file is created, so a rejected run does
    # not leave an empty file behind.
    if stype == 'user_defined':
        if user_defined is None:
            logger.error('-u is needed in combination with -st user_defined.')
            sys.exit(1)
        if len(user_defined) != 2:
            logger.error(
                'Please specify parent and child feature via the -u argument. Format: [parent feature type],[child feature type]'
            )
            sys.exit(1)
    elif user_defined is not None:
        logger.warning(
            'Your sequence type is "{0:s}", -u argument will be ignored.'.
            format(stype))

    # %s-style arguments: fasta_file is None in embedded-FASTA mode and
    # '{:s}'.format(None) raises TypeError.
    logger.info('Reading files: %s, %s...', gff_file, fasta_file)
    gff = None

    if qc:
        initial_phase = False
        gff = Gff3(gff_file=gff_file, fasta_external=fasta_file, logger=logger)
        if embedded_fasta and len(gff.fasta_embedded) == 0:
            logger.error('There is no embedded fasta in the GFF3 file.')
            sys.exit(1)
        logger.info('Checking errors...')
        gff.check_parent_boundary()
        gff.check_phase(initial_phase)
        gff.check_reference()
        # Each checker may return nothing; only merge non-empty results.
        error_set = list()
        found = function4gff.extract_internal_detected_errors(gff)
        if found:
            error_set.extend(found)
        found = intra_model.main(gff, logger=logger)
        if found:
            error_set.extend(found)
        found = single_feature.main(gff, logger=logger)
        if found:
            error_set.extend(found)

        if error_set:
            # These two error codes are deliberately left out of the report.
            escaped_error = ['Esf0012', 'Esf0033']
            eSet = [e for e in error_set if e['eCode'] not in escaped_error]
            if len(eSet):
                logger.warning(
                    'The extracted sequences might be wrong for the following features which have formatting errors...'
                )
                print('ID\tError_Code\tError_Tag')
                for e in eSet:
                    tag = '[{0:s}]'.format(e['eTag'])
                    print(e['ID'], e['eCode'], tag)
    else:
        gff = Gff3(gff_file=gff_file,
                   fasta_external=fasta_file,
                   logger=logger_null)
        if embedded_fasta and len(gff.fasta_embedded) == 0:
            logger.error('There is no embedded fasta in the GFF3 file.')
            # Same outcome as the QC branch: there is nothing to extract from.
            sys.exit(1)

    logger.info('Extract sequences for {0:s}...'.format(stype))
    seq = dict()
    if stype == 'all':
        logger.info('Specifying prefix of output file name: (%s)...',
                    output_prefix)

        for tmp_stype in ('pre_trans', 'gene', 'exon'):
            logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
            seq = extract_start_end(gff, tmp_stype, dline, embedded_fasta)
            _report_all_type(seq, output_prefix, tmp_stype, logger)

        # NOTE(review): for 'trans' and 'cds' splicer receives stype ('all'),
        # whereas 'pep' below passes 'pep'. Preserved as written -- confirm
        # against splicer whether these should pass tmp_stype as well.
        tmp_stype = 'trans'
        feature_type = ['exon', 'pseudogenic_exon']
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        _report_all_type(seq, output_prefix, tmp_stype, logger)

        tmp_stype = 'cds'
        feature_type = ['CDS']
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        _report_all_type(seq, output_prefix, tmp_stype, logger)

        tmp_stype = 'pep'
        feature_type = ['CDS']
        logger.info('\t- Extract sequences for {0:s}...'.format(tmp_stype))
        tmpseq = splicer(gff, feature_type, dline, tmp_stype, embedded_fasta)
        seq = _cds_to_peptides(tmpseq)
        _report_all_type(seq, output_prefix, tmp_stype, logger)
    else:
        if stype == 'user_defined':
            feature_type = [user_defined[0], user_defined[1]]
            seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        elif stype in ('pre_trans', 'gene', 'exon'):
            seq = extract_start_end(gff, stype, dline, embedded_fasta)
        elif stype == 'trans':
            feature_type = ['exon', 'pseudogenic_exon']
            seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        elif stype == 'cds':
            feature_type = ['CDS']
            seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
        elif stype == 'pep':
            feature_type = ['CDS']
            tmpseq = splicer(gff, feature_type, dline, stype, embedded_fasta)
            seq = _cds_to_peptides(tmpseq)
        if len(seq):
            logger.info(
                'Print out extracted sequences: {0:s}_{1:s}.fa...'.format(
                    output_prefix, stype))
        # The single-type output file is always created, even when nothing
        # was extracted.
        _write_fasta_records('{0:s}_{1:s}.fa'.format(output_prefix, stype),
                             seq)
Esempio n. 2
0
def script_main():
    """Command-line entry point: check a GFF3 file and write two reports.

    Parses the command line, loads the GFF3 and reference FASTA into a
    ``Gff3`` object, runs the built-in, intra-model, inter-model and
    single-feature checks, then writes

    * the error report (``-o``, default ``report.txt``) with one
      ``Line_num / Error_code / Error_tag`` row per detected error, and
    * the statistic report (``-s``, default ``statistic.txt``) with the
      number of errors per error code.

    Exits with status 1 (after printing the help) when an input is neither
    given on the command line nor available on a piped STDIN, and calls
    ``sys.exit()`` when ``check_parent_boundary()`` returns a false value.
    """
    logger_stderr = logging.getLogger(__name__ + 'stderr')
    logger_stderr.setLevel(logging.INFO)
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(
        logging.Formatter('%(levelname)-8s %(message)s'))
    logger_stderr.addHandler(stderr_handler)
    logger_null = logging.getLogger(__name__ + 'null')
    null_handler = logging.NullHandler()
    logger_null.addHandler(null_handler)
    import argparse
    from textwrap import dedent
    # Every description line is indented with spaces only: dedent() strips
    # nothing when some lines start with tabs and others with spaces.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=dedent("""\

    Testing environment:
    1. Python 2.7

    Inputs:
    1. GFF3: Specify the file name with the -g or --gff argument; Please note that this program requires gene/pseudogene and mRNA/pseudogenic_transcript to have an ID attribute in column 9.
    2. fasta file: Specify the file name with the -f or --fasta argument

    Outputs:
    1. Error report for the input GFF3 file
        * Line_num: Line numbers of the found problematic models in the input GFF3 file.
        * Error_code: Error codes for the found problematic models. Please refer to lib/ERROR/ERROR.py to see the full list of Error_code and the corresponding Error_tag.
        * Error_tag: Detail of the found errors for the problematic models. Please refer to lib/ERROR/ERROR.py to see the full list of Error_code and the corresponding Error_tag.

    Quick start:
    gff3_QC -g example_file/example.gff3 -f example_file/reference.fa -o test
    or
    gff3_QC --gff example_file/example.gff3 --fasta example_file/reference.fa --output test

    """))
    parser.add_argument('-g',
                        '--gff',
                        type=str,
                        help='Genome annotation file, gff3 format')
    parser.add_argument('-f',
                        '--fasta',
                        type=str,
                        help='Genome sequences, fasta format')
    parser.add_argument(
        '-noncg',
        '--noncanonical_gene',
        action="store_true",
        help='gff3 file is not formatted in the canonical gene model format.')
    parser.add_argument(
        '-i',
        '--initial_phase',
        action="store_true",
        help='Check whether initial CDS phase is 0 (default: no check)')
    parser.add_argument(
        '-n',
        '--allowed_num_of_n',
        type=int,
        default=0,
        help=
        'Max number of Ns allowed in a feature, anything more will be reported as an error (default: 0)'
    )
    parser.add_argument(
        '-t',
        '--check_n_feature_types',
        nargs='*',
        default=['CDS'],
        help=
        'Count the number of Ns in each feature with the type specified, multiple types may be specified, ex: -t CDS exon (default: "CDS")'
    )
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        help='output file name (default: report.txt)')
    parser.add_argument('-s',
                        '--statistic',
                        type=str,
                        help='statistic file name (default: statistic.txt)')
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args()
    # NOTE(review): when neither -g nor -f is given and STDIN is piped, both
    # inputs end up bound to the same sys.stdin stream -- confirm intended.
    if args.gff:
        logger_stderr.info('Checking gff file (%s)...', args.gff)
    elif not sys.stdin.isatty():  # if STDIN connected to pipe or file
        args.gff = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input
        parser.print_help()
        sys.exit(1)
    if args.fasta:
        logger_stderr.info('Checking genome fasta (%s)...', args.fasta)
    elif not sys.stdin.isatty():  # if STDIN connected to pipe or file
        args.fasta = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input
        parser.print_help()
        sys.exit(1)
    # N-counting is enabled when a threshold or any feature type is set.
    check_n = bool(args.allowed_num_of_n or args.check_n_feature_types)

    logger_stderr.info('Reading gff files: (%s)...\n', args.gff)
    gff3 = Gff3(gff_file=args.gff,
                fasta_external=args.fasta,
                logger=logger_null)
    logger_stderr.info('Checking errors in the gff files: (%s)...\n', args.gff)
    if not gff3.check_parent_boundary():
        sys.exit()
    gff3.check_unresolved_parents()
    if not args.noncanonical_gene:
        gff3.check_phase(args.initial_phase)
    gff3.check_reference(fasta_external=args.fasta,
                         check_n=check_n,
                         allowed_num_of_n=args.allowed_num_of_n,
                         feature_types=args.check_n_feature_types)
    logger_stderr.info('\t- Checking missing attributes: (%s)...\n',
                       'function4gff.FIX_MISSING_ATTR()')
    function4gff.FIX_MISSING_ATTR(gff3, logger=logger_stderr)

    # Collect the findings of every checker; each may return nothing.
    error_set = list()
    cmd = function4gff.extract_internal_detected_errors(gff3)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking intra-model errors: (%s)...\n', args.gff)
    cmd = intra_model.main(gff3,
                           logger=logger_stderr,
                           noncanonical_gene=args.noncanonical_gene)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking inter-model errors: (%s)...\n', args.gff)
    cmd = inter_model.main(gff3,
                           args.gff,
                           args.fasta,
                           logger=logger_stderr,
                           noncanonical_gene=args.noncanonical_gene)
    if cmd:
        error_set.extend(cmd)
    logger_stderr.info('\t- Checking single-feature errors: (%s)...\n',
                       args.gff)
    cmd = single_feature.main(gff3, logger=logger_stderr)
    if cmd:
        error_set.extend(cmd)

    report_name = args.output if args.output else 'report.txt'
    statistic_name = args.statistic if args.statistic else 'statistic.txt'

    # Both reports are written inside ``with`` blocks so the file handles are
    # flushed and closed even if a write fails.
    logger_stderr.info('Print QC report at {0:s}'.format(report_name))
    with open(report_name, 'w') as report_fh:
        report_fh.write('Line_num\tError_code\tError_tag\n')
        for e in sorted(error_set, key=lambda x: sorted(x.keys())):
            tag = '[{0:s}]'.format(e['eTag'])
            report_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(
                str(e['line_num']), str(e['eCode']), str(tag)))

    # statistic_file: count the errors per error code.
    logger_stderr.info(
        'Print QC statistic report at {0:s}'.format(statistic_name))
    error_counts = dict()
    ERROR_INFO = ERROR.INFO
    for s in sorted(error_set, key=lambda x: sorted(x.keys())):
        if s['eCode'] not in error_counts:
            error_counts[s['eCode']] = {
                'count': 0,
                'etag': ERROR_INFO[s['eCode']]
            }
        error_counts[s['eCode']]['count'] += 1
    with open(statistic_name, 'w') as statistic_fh:
        statistic_fh.write(
            'Error_code\tNumber_of_problematic_models\tError_tag\n')
        for a in error_counts:
            statistic_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(
                str(a), str(error_counts[a]['count']),
                str(error_counts[a]['etag'])))