def main(infiles, args):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """

    if not infiles:
        parser.print_help()
        sys.exit("\nError: at least one infile and exactly one outfile are required!")

    all_names_and_seqs = []

    for infile in infiles:
        seq_format = check_fasta_fastq_format(infile)

        with open(infile) as INFILE:
            for sequence in SeqIO.parse(INFILE, seq_format):
                # using seq.tostring() to convert Biopython Seq objects to plain strings - Seq objects aren't hashable correctly
                all_names_and_seqs.append((sequence.name, sequence.seq.tostring()))

    no_repeats = True
    for (nameA, seqA), (nameB, seqB) in combinations(all_names_and_seqs, 2):
        result = check_pair(
            seqA, seqB, nameA, nameB, options.exact_identity_only, options.forward_only, options.ignore_empty_sequences
        )
        if result:
            print result
            no_repeats = False
    if no_repeats:
        print "NO REPEATS."
# Example #2
def main(infiles, total_seq_number_only=False, input_collapsed_to_unique=False, 
         include_zeros=False, verbosity=1, OUTPUT=sys.stdout):
    """ Given a list of fastq/fasta files, return total seq number, a length:N dict and formatted info (optionally print).
    
    If total_seq_number_only is True, only return/print total seq count.
    If input_collapsed_to_unique is True, program assumes infile was preprocessed with fastx_collapser, 
     and attempts to give original pre-collapsing seq_counts (based on headers).
    If include_zeros is False (default), only print non-zero seq counts; if True, print seq counts for 
     all lengths between min and max length, even if they're 0.
    Verbosity: if >1, print filetype and seqcount for each input file; if 0, don't print header or summary.
    Prints to stdout by default; to print to file, pass open file object as OUTPUT; to suppress printing, pass None.
    Returns a (total_seqcount, total_seqlen_dict, formatted_output_lines) tuple."""

    total_seqcount, total_seqlen_dict = 0, {}
    formatted_output = []
    # accumulate the seq count and per-length counts from each input file
    for infile in infiles:
        # detect filetype based on extension
        #  MAYBE-TODO add command-line options that force the format to fasta/fastq instead of checking by extension?
        seq_format = check_fasta_fastq_format(infile, verbosity>1)
        # note: just using plain "fastq" quality encoding, because we're not dealing with qualities so it doesn't matter
        with open(infile) as INFILE:
            file_seqcount, file_seqlen_dict = seq_count_and_lengths(SeqIO.parse(INFILE, seq_format), 
                                                                    total_seq_number_only, input_collapsed_to_unique)
        total_seqcount += file_seqcount
        total_seqlen_dict = add_dicts_of_ints(total_seqlen_dict, file_seqlen_dict)

    # format, optionally print, and return the output
    if total_seq_number_only:
        formatted_output.append("Total %s seqs\n"%total_seqcount)
    else:
        formatted_output += _format_lengths(total_seqlen_dict, include_zeros, verbosity)
    # "is not None" (not truthiness) so an empty but valid file object still gets written to
    if OUTPUT is not None:
        OUTPUT.writelines(formatted_output)
    return total_seqcount, total_seqlen_dict, formatted_output
def main(infiles, args):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.
    """

    if not infiles:
        parser.print_help()
        sys.exit(
            "\nError: at least one infile and exactly one outfile are required!"
        )

    if options.seq_length is None: seqlen_info = ''
    elif options.seq_length > 0:
        seqlen_info = ' first %sbp' % options.seq_length
    elif options.seq_length < 0:
        seqlen_info = ' last %sbp' % (-options.seq_length)

    for infile in infiles:
        seq_format = check_fasta_fastq_format(infile)

        with open(infile) as INFILE:
            seq_counter = subsequence_counts(
                SeqIO.parse(INFILE, seq_format), options.seq_length,
                options.input_collapsed_to_unique)

        seq_list_by_count = sorted(
            seq_counter.items(), key=lambda (s, c): c, reverse=True)

        total_seqs = sum(seq_counter.values())

        # if not using the min_percent_to_print option, just print the top N sequences from each file
        if options.min_percent_to_print is None:
            seq_data_list = []
            for i in range(min(options.n_to_print, len(seq_list_by_count))):
                seq, count = seq_list_by_count[i]
                percent = count * 100.0 / total_seqs
                # "%.2g" is significant-digit-based formatting of floats!!  So 92.12345 is 92%, but 0.00045 is 0.00045%.
                percent_2_sig_digits = str(float("%.2g" % percent))
                if percent_2_sig_digits.endswith(".0"):
                    percent_2_sig_digits = percent_2_sig_digits[:-2]
                seq_data_list.append(
                    "%s%% %s (%d)" % (percent_2_sig_digits, seq, count))
            print " * %s (%s seqs, %s unique%s):" % (
                infile, total_seqs, len(seq_list_by_count), seqlen_info)
            print ', '.join(seq_data_list)

        # if using the min_percent_to_print option, just print the top N sequences from each file
        else:
            print "min_percent_to_print NOT IMPLEMENTED!"
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.

    Overview of this run: align the single infile in args with bowtie against the genome index
    (and optionally a cassette index), stream-merge the resulting alignment files read-by-read,
    categorize each read (unaligned/cassette/multiple-genomic/genomic-unique), write reads to
    per-category output files (or one combined file), and record run metadata in a
    <outfile_basename>_info.txt file.
    """

    # exactly one input file is required (bowtie is run once per file)
    try:
        [infile] = args
    except ValueError:
        parser.print_help()
        sys.exit(
            "Error: exactly one infile required! %s infiles provided: %s" %
            (len(args), args))
        # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...

    # reject pass-through bowtie options (-B) that would conflict with ones this program sets itself
    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([
            x in other_bowtie_options_split
            for x in ('-v -e --maqerr -n --seedmms -l --seedlen'.split(' '))
    ]):
        raise Exception(
            "Cannot include -v/-n/-e and related bowtie options in -B!  Use separate -e option for that; "
            "note that this program allows -v bowtie mode only.")
    if any([
            x in other_bowtie_options_split
            for x in ('-m -k -a --all'.split(' '))
    ]):
        raise Exception(
            "Cannot include -m/-a bowtie options in -B!  Use separate -m option for that."
        )

    # -v mode with the allowed mismatch count; add -f/-q based on auto-detected infile format
    # unless the caller already forced a format via -B
    specific_bowtie_options = '-v %s' % options.allowed_errors
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format == 'fasta': specific_bowtie_options += ' -f'
        elif infile_format == 'fastq': specific_bowtie_options += ' -q'
        else:
            raise Exception("Cannot process auto-detected infile format %s!" %
                            infile_format)

    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1: multiple_bowtie_option = '-a'
    else: multiple_bowtie_option = '-k %s' % max(options.multiple_to_show, 2)

    # output file names: temporary for alignments, final (split or all), metadata info file.
    outfile_suffix = '.sam' if any(
        [x in options.other_bowtie_options
         for x in ['-S', '--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
                                   + ('.fa' if options.multiple_to_show==0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'

    with open(infofile, 'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE, options)

        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version",
                                      INFOFILE,
                                      printing_level=0,
                                      shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        #   (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's
        #    an error, so I can see the error message!  Or I could try to detect whether there was an error or not
        #    based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s" % (
            specific_bowtie_options, multiple_bowtie_option,
            options.other_bowtie_options, options.genome_bowtie_index, infile,
            tmpfile_genome)

        if options.bowtie_aln_file_genome is None:
            run_command_print_info_output(command,
                                          INFOFILE,
                                          printing_level=(not options.quiet),
                                          shell=True)
        else:
            # a pre-made genome alignment file was provided: use it instead of running bowtie,
            # and force keep_tmpfiles since the provided file isn't ours to delete
            options.keep_tmpfiles = True
            if not os.access(options.bowtie_aln_file_genome, os.R_OK):
                raise Exception(
                    "Can't read provided options.bowtie_aln_file_genome %s!" %
                    options.bowtie_aln_file_genome)
            text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (
                command, options.bowtie_aln_file_genome)
            print text
            INFOFILE.write('\n' + text + '\n')
            tmpfile_genome = options.bowtie_aln_file_genome

        ### run bowtie vs the cassette index file if given
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            # cassette alignments always use --all so every cassette hit is reported
            command = "bowtie %s %s %s %s %s %s" % (
                specific_bowtie_options, '--all', options.other_bowtie_options,
                options.cassette_bowtie_index, infile, tmpfile_cassette)
            if options.bowtie_aln_file_cassette is None:
                run_command_print_info_output(
                    command,
                    INFOFILE,
                    printing_level=(not options.quiet),
                    shell=True)
            else:
                # pre-made cassette alignment file provided - same logic as for the genome one above
                options.keep_tmpfiles = True
                if not os.access(options.bowtie_aln_file_cassette, os.R_OK):
                    raise Exception(
                        "Can't read provided options.bowtie_aln_file_cassette %s!"
                        % options.bowtie_aln_file_cassette)
                text = "UNUSUAL RUN: Instead of running \"%s\", using file %s." % (
                    command, options.bowtie_aln_file_cassette)
                print text
                INFOFILE.write('\n' + text + '\n')
                tmpfile_cassette = options.bowtie_aln_file_cassette

        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text %
                     (options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(
                tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text %
                     (options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1?  Hard - bowtie is unfortunately ANNOYING
        #  and uses stderr both for normal output and for errors, AND gives no returncode.

        ### Parse the two alignment files in parallel, and merge them together (remove sub-optimal alignments,
        #    (and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        #  Do all this WITHOUT reading the entire files into memory!  A bit tricky.
        if options.cassette_bowtie_index != 'NONE':
            aln_list_generator = aln_generator_from_two_samfiles_parallel(
                tmpfile_genome, tmpfile_cassette)
        else:
            aln_list_generator = aln_generator_from_single_samfile(
                tmpfile_genome)
        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            # all four category handles alias the same single output file object
            GENOMIC_UNIQUE_FILE = MULTIPLE_GENOMIC_FILE = CASSETTE_FILE = UNALIGNED_FILE = open(
                outfile_all, 'w')
            unaligned_as_fasta = False
        else:
            UNALIGNED_FILE = open(outfile_unaligned, 'w')
            CASSETTE_FILE = open(outfile_cassette, 'w')
            MULTIPLE_GENOMIC_FILE = open(outfile_multiple_genomic, 'w')
            GENOMIC_UNIQUE_FILE = open(outfile_genomic_unique, 'w')
            unaligned_as_fasta = True
        # per-category read counts, filled in by categorize_reads_print_to_files below
        category_readcounts = {
            'unaligned': 0,
            'cassette': 0,
            'multiple-genomic': 0,
            'genomic-unique': 0,
            'cassette-multiple': 0
        }
        for (readname, full_aln_list) in aln_list_generator:
            # drop sub-optimal alignments, then prefer cassette alignments over equal-quality genomic ones
            reduced_aln_list = reduce_alignment_list(full_aln_list)
            final_aln_list = prioritize_cassette_reads(
                reduced_aln_list, if_cassette_function=is_cassette_chromosome)
            categorize_reads_print_to_files(
                readname,
                final_aln_list,
                category_readcounts,
                UNALIGNED_FILE,
                CASSETTE_FILE,
                MULTIPLE_GENOMIC_FILE,
                GENOMIC_UNIQUE_FILE,
                unaligned_as_fasta=unaligned_as_fasta,
                multiple_to_write=options.multiple_to_show,
                input_collapsed_to_unique=options.input_collapsed_to_unique,
                no_multi_cassette_warnings=options.no_multi_cassette_warnings)
        if options.dont_split_by_category:
            # all files are actually the same pointer, so only close once
            GENOMIC_UNIQUE_FILE.close()
        else:
            UNALIGNED_FILE.close()
            CASSETTE_FILE.close()
            MULTIPLE_GENOMIC_FILE.close()
            GENOMIC_UNIQUE_FILE.close()

        # delete alignment tmpfiles now that they've been parsed
        if not options.keep_tmpfiles:
            os.remove(tmpfile_genome)
            if options.cassette_bowtie_index != 'NONE':
                os.remove(tmpfile_cassette)

        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        # 'cassette-multiple' is reported as a warning on the cassette line rather than as its own category
        cassette_multiple = category_readcounts.pop('cassette-multiple')
        total_reads = sum(category_readcounts.values())
        text2 = "# total reads:  %s" % total_reads
        if options.input_collapsed_to_unique:
            text2 += " (uncollapsed readcounts)"
        lines = [text1, text2]
        for category, count in sorted(category_readcounts.items()):
            text = "# %s:  %s" % (category,
                                  value_and_percentages(count, [total_reads]))
            if category == 'cassette' and cassette_multiple:
                text += ' (Warning: %s multiple!!)' % cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet: print text

        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write(
            "\n\n################## Metadata from input preprocessing ##################\n\n"
        )
        if options.input_metadata_file == 'NONE':
            INFOFILE.write(
                'Not looking for a metadata input file, as specified by options\n'
            )
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.txt is X.fa, but for X_5prime.txt it can be either X_5prime.txt or X.txt, so try both.
                #  (in the new preprocessing version all files are X_*prime.txt and the info files are X_info.txt;
                #   in the old version it was just X.txt and X_info.txt)
                # MAYBE-TODO add a test-case for this thing!  Probably too minor.
                metafile_basename = os.path.splitext(infile)[0]
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    # NOTE: '_3prime' and '_5prime' are the same length, so one slice handles both suffixes
                    if metafile_basename.endswith(
                            '_3prime') or metafile_basename.endswith(
                                '_5prime'):
                        options.input_metadata_file = metafile_basename[:-len(
                            '_3prime')] + '_info.txt'
                text = 'Automatically determining metadata input file name: %s\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n' % options.input_metadata_file
            INFOFILE.write(text + '\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file,
                                     INFOFILE,
                                     printing=False)
            else:
                text = 'Metadata input file %s not found!\n' % options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)
def main(args, options):
    """ Run the main functionality of the module (see module docstring for more information), excluding testing.
    The options argument should be generated by an optparse parser.

    Older all-in-memory variant of the bowtie alignment pipeline: aligns the single infile in
    args against the genome (and optionally cassette) bowtie index, reads the full alignment
    files into a dict, categorizes each read, writes per-category output files (or one combined
    file), and records run metadata in a <outfile_basename>_info.txt file.
    """

    # exactly one input file is required (bowtie is run once per file)
    try:
        [infile] = args
    except ValueError:
        parser.print_help()
        sys.exit("Error: exactly one infile required! %s infiles provided: %s"%(len(args), args))
        # MAYBE-TODO bowtie could take multiple infiles, but then I'd have to deal with multiple preprocessing metafiles...

    # reject pass-through bowtie options (-B) that would conflict with ones this program sets itself
    other_bowtie_options_split = options.other_bowtie_options.split(' ')
    if any([x in other_bowtie_options_split for x in ('-v -e --maqerr -n --seedmms -l --seedlen'.split(' '))]):
        raise Exception("Cannot include -v/-n/-e and related bowtie options in -B!  Use separate -e option for that; "
                        "note that this program allows -v bowtie mode only.")
    if any([x in other_bowtie_options_split for x in ('-m -k -a --all'.split(' '))]):
        raise Exception("Cannot include -m/-a bowtie options in -B!  Use separate -m option for that.")

    # -v mode with the allowed mismatch count; add -f/-q based on auto-detected infile format
    # unless the caller already forced a format via -B
    specific_bowtie_options = '-v %s'%options.allowed_errors
    if not any([x in options.other_bowtie_options for x in ('-f', '-q')]):
        infile_format = check_fasta_fastq_format(infile)
        if infile_format=='fasta':      specific_bowtie_options += ' -f'
        elif infile_format=='fastq':    specific_bowtie_options += ' -q'
        else:                           raise Exception("Cannot process auto-detected infile format %s!"%infile_format)

    # using a minimum of -k 2 (or -a) in order to make sure I can easily tell multiple from unique alignments
    if options.multiple_to_show == -1:  multiple_bowtie_option = '-a' 
    else:                               multiple_bowtie_option = '-k %s'%max(options.multiple_to_show, 2)

    # output file names: temporary for alignments, final (split or all), metadata info file. 
    outfile_suffix = '.sam' if any([x in options.other_bowtie_options for x in ['-S','--sam']]) else '.map'
    tmpfile_genome = options.outfile_basename + '_tmp_genome' + outfile_suffix
    if options.cassette_bowtie_index != 'NONE':
        tmpfile_cassette = options.outfile_basename + '_tmp_cassette' + outfile_suffix
    if options.dont_split_by_category:
        outfile_all = options.outfile_basename + outfile_suffix
    else:
        outfile_unaligned = options.outfile_basename + '_unaligned.fa'
        outfile_cassette = options.outfile_basename + '_cassette' + outfile_suffix
        outfile_multiple_genomic = options.outfile_basename + '_multiple-genomic'\
                                   + ('.fa' if options.multiple_to_show==0 else outfile_suffix)
        outfile_genomic_unique = options.outfile_basename + '_genomic-unique' + outfile_suffix
    infofile = options.outfile_basename + '_info.txt'

    with open(infofile,'w') as INFOFILE:

        ### write header data
        write_header_data(INFOFILE,options)

        ### run bowtie vs the main/genome index file
        # run 'bowtie --version' to get that data (print to INFOFILE but not stdout)
        INFOFILE.write('\n\n')
        run_command_print_info_output("bowtie --version", INFOFILE, printing_level=0, shell=True)
        # run the actual bowtie alignment command; always print output to stdout as well as INFOFILE
        #   (bowtie actually prints the summary to stderr, not stdout, so I need to print it to stdout in case there's 
        #    an error, so I can see the error message!  Or I could try to detect whether there was an error or not
        #    based on the output contents, but that seems like unnecessary work.)
        INFOFILE.write('\n\n')
        command = "bowtie %s %s %s %s %s %s"%(specific_bowtie_options, multiple_bowtie_option, 
                                      options.other_bowtie_options, options.genome_bowtie_index, infile, tmpfile_genome)
        run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)

        ### run bowtie vs the cassette index file if given
        # cassette alignments always use --all so every cassette hit is reported
        if options.cassette_bowtie_index != 'NONE':
            INFOFILE.write('\n\n')
            command = "bowtie %s %s %s %s %s %s"%(specific_bowtie_options, '--all', options.other_bowtie_options, 
                                                  options.cassette_bowtie_index, infile, tmpfile_cassette)
            run_command_print_info_output(command, INFOFILE, printing_level=(not options.quiet), shell=True)

        ### Check that bowtie runs worked
        missing_alnfile_text = "Bowtie run against %s failed! See above or %s file for bowtie error message."
        if not os.access(tmpfile_genome, os.R_OK):
            sys.exit(missing_alnfile_text%(options.genome_bowtie_index, infofile))
        if options.cassette_bowtie_index != 'NONE' and not os.access(tmpfile_cassette, os.R_OK):
            sys.exit(missing_alnfile_text%(options.cassette_bowtie_index, infofile))
        # MAYBE-TODO make sure bowtie errors are printed to stdout even with -1?  Hard - bowtie is unfortunately ANNOYING 
        #  and uses stderr both for normal output and for errors, AND gives no returncode. 

        ### Parse the two alignment files, and merge them together (remove sub-optimal alignments,
        #    (and remove non-cassette ones if there are cassette ones with equal quality); remove alignment files.
        readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            readname_to_aln_list = make_aln_dict_from_samfile(tmpfile_cassette, starting_dict=readname_to_aln_list)
        # MAYBE-TODO right now I'm reading the entire files into memory before merging and processing them, 
        #  which takes a fair amount of memory - could instead write something that would read both alignment files
        #  in parallel and do the merging and output-writing read-by-read.  Do that if I start getting memory issues.
        # both helpers appear to modify readname_to_aln_list in place (return values are ignored)
        reduce_alignment_dict(readname_to_aln_list)
        prioritize_cassette_reads(readname_to_aln_list, if_cassette_function=is_cassette_chromosome)
        # delete alignment tmpfiles now that they've been parsed
        os.remove(tmpfile_genome)
        if options.cassette_bowtie_index != 'NONE':
            os.remove(tmpfile_cassette)

        ### Decide the proper category for each read, and write the info to appropriate final output files
        if options.dont_split_by_category:
            # a single combined output file: the same handle is passed for all four categories
            with open(outfile_all,'w') as ALL_FILE:
                category_counts = categorize_reads_print_to_files(readname_to_aln_list, ALL_FILE, ALL_FILE, ALL_FILE, 
                                          ALL_FILE, unaligned_as_fasta=False, multiple_to_write=options.multiple_to_show, 
                                          input_collapsed_to_unique=options.input_collapsed_to_unique, 
                                          no_warnings=options.quiet)
        else:
            with open(outfile_unaligned, 'w') as UNALIGNED_FILE:
                with open(outfile_cassette, 'w') as CASSETTE_FILE:
                    with open(outfile_multiple_genomic, 'w') as MULTIPLE_GENOMIC_FILE:
                        with open(outfile_genomic_unique, 'w') as GENOMIC_UNIQUE_FILE:
                            category_counts = categorize_reads_print_to_files(readname_to_aln_list, UNALIGNED_FILE, 
                                                      CASSETTE_FILE, MULTIPLE_GENOMIC_FILE, GENOMIC_UNIQUE_FILE, 
                                                      unaligned_as_fasta=True, multiple_to_write=options.multiple_to_show, 
                                                      input_collapsed_to_unique=options.input_collapsed_to_unique, 
                                                      no_warnings=options.quiet)

        ### print category_readcounts to INFOFILE in a nice way
        text1 = "\n### FINAL ALIGNMENT CATEGORY COUNTS"
        # 'cassette-multiple' is reported as a warning on the cassette line rather than as its own category
        cassette_multiple = category_counts.pop('cassette-multiple')
        total_reads = sum(category_counts.values())
        text2 = "# total reads:  %s"%total_reads
        if options.input_collapsed_to_unique: text2 +=" (uncollapsed readcounts)"
        lines = [text1, text2]
        for category,count in sorted(category_counts.items()):
            text = "# %s:  %s"%(category, value_and_percentages(count, [total_reads]))
            if category=='cassette' and cassette_multiple:  
                text += ' (Warning: %s multiple!!)'%cassette_multiple
            lines.append(text)
        INFOFILE.write('\n')
        for text in lines:
            INFOFILE.write(text + '\n')
            if not options.quiet: print text

        ### copy preprocessing metadata file to the bottom of the new metadata file
        INFOFILE.write("\n\n################## Metadata from input preprocessing ##################\n\n")
        if options.input_metadata_file == 'NONE':
            INFOFILE.write('Not looking for a metadata input file, as specified by options\n')
        else:
            if options.input_metadata_file == 'AUTO':
                # the correct info file for X.txt is X.fa, but for X_5prime.txt it can be either X_5prime.txt or X.txt, so try both.
                #  (in the new preprocessing version all files are X_*prime.txt and the info files are X_info.txt; 
                #   in the old version it was just X.txt and X_info.txt)
                # MAYBE-TODO add a test-case for this thing!  Probably too minor.
                metafile_basename = os.path.splitext(infile)[0] 
                options.input_metadata_file = metafile_basename + '_info.txt'
                if not os.path.exists(options.input_metadata_file):
                    # NOTE: '_3prime' and '_5prime' are the same length, so one slice handles both suffixes
                    if metafile_basename.endswith('_3prime') or metafile_basename.endswith('_5prime'):
                        options.input_metadata_file = metafile_basename[:-len('_3prime')] + '_info.txt'
                text = 'Automatically determining metadata input file name: %s\n'%options.input_metadata_file
                if not options.quiet:
                    print text,
            else:
                text = 'Metadata input file name provided in options: %s\n'%options.input_metadata_file
            INFOFILE.write(text+'\n')
            if os.path.exists(options.input_metadata_file):
                print_text_from_file(options.input_metadata_file, INFOFILE, printing=False)
            else:
                text = 'Metadata input file %s not found!\n'%options.input_metadata_file
                if not options.quiet:
                    print text,
                INFOFILE.write(text)