Example 1
def check_binary(name, file_path: bool = True) -> bool:
    # Handles files if file_path is True or text if file_path is False
    temp: IO[bytes]
    if file_path:
        temp = open(name, "rb")
        size = os.stat(name).st_size
    else:
        temp = BytesIO(name)
        size = len(name)
    read_start = int(size / 2)
    read_length = 1024
    try:
        if util.is_binary(temp.read(read_length)):
            return True
        # Some binary files have text only within the first 1024
        # Read 1024 from the middle of the file if this is not
        # a gzip or zip compressed file (bzip are indexed),
        # to avoid issues with long txt headers on binary files.
        if file_path and not is_gzip(name) and not is_zip(name) and not is_bz2(name):
            # file_path=False doesn't seem to be used in the codebase
            temp.seek(read_start)
            return util.is_binary(temp.read(read_length))
        return False
    finally:
        temp.close()
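
A quick usage sketch (not from the Galaxy sources): util.is_binary is project-internal, so the stand-in below assumes a simple NUL-byte heuristic, and the two calls mirror the file_path=True and file_path=False branches.

from io import BytesIO


def is_binary_guess(data):
    # Hypothetical stand-in for util.is_binary: a NUL byte means binary.
    return b"\x00" in data


# file_path=True branch: read the first KiB of a file on disk.
with open(__file__, "rb") as fh:
    print(is_binary_guess(fh.read(1024)))  # False for this text file

# file_path=False branch: the payload is already in memory.
print(is_binary_guess(BytesIO(b"\x1f\x8b\x00binary").read(1024)))  # True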
Example 2
def set_meta(self, dataset, **kwd):
    if dataset.has_data():
        dataset.metadata.field_names = []
        dataset.metadata.field_components = {}
        dataset_type = None
        field_components = {}
        dataset_structure_complete = False
        with open(dataset.file_name) as fh:
            for i, line in enumerate(fh):
                line = line.strip()
                if not line:
                    continue
                if i < 3:
                    dataset = self.set_initial_metadata(i, line, dataset)
                elif dataset.metadata.file_format == 'ASCII' or not util.is_binary(line):
                    if dataset_structure_complete:
                        dataset, field_components = self.set_dataset_attributes_metadata(line, dataset, field_components)
                    elif line.startswith('POINT_DATA') or line.startswith('CELL_DATA'):
                        dataset_structure_complete = True
                        dataset, field_components = self.set_dataset_attributes_metadata(line, dataset, field_components)
                    else:
                        dataset, dataset_type = self.set_dataset_structure_metadata(line, dataset, dataset_type)
        if len(field_components) > 0:
            dataset.metadata.field_components = field_components
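
The three set_initial_metadata calls consume the fixed legacy VTK header: a version line, a title line, and the ASCII/BINARY flag; the remaining lines go through the structure and attribute handlers. A minimal ASCII example (sample data, not a file from the Galaxy test suite):

# vtk DataFile Version 3.0
Sample dataset title
ASCII
DATASET POLYDATA
POINTS 3 float
0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0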
Example 3
def check_binary(name, file_path=True):
    # Handles files if file_path is True or text if file_path is False
    if file_path:
        temp = open(name, "rb")
    else:
        temp = BytesIO(name)
    try:
        return util.is_binary(temp.read(1024))
    finally:
        temp.close()
Example 4
def check_binary(name, file_path=True):
    # Handles files if file_path is True or text if file_path is False
    if file_path:
        temp = open(name, "rb")
    else:
        temp = BytesIO(name)
    try:
        return util.is_binary(temp.read(1024))
    finally:
        temp.close()
Example 5
def stream_to_open_named_file(stream,
                              fd,
                              filename,
                              source_encoding=None,
                              source_error='strict',
                              target_encoding=None,
                              target_error='strict'):
    """Writes a stream to the provided file descriptor, returns the file's name and bool( is_multi_byte ). Closes file descriptor"""
    # signature and behavior is somewhat odd, due to backwards compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    try:
        codecs.lookup(target_encoding)
    except Exception:
        target_encoding = util.DEFAULT_ENCODING  # utf-8
    if not source_encoding:
        source_encoding = util.DEFAULT_ENCODING  # sys.getdefaultencoding() would mimic old behavior (defaults to ascii)
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file
            if zipfile.is_zipfile(filename):
                is_compressed = True
            else:
                try:
                    if text_type(chunk[:2]) == text_type(util.gzip_magic):
                        is_compressed = True
                except Exception:
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = multi_byte.is_multi_byte(chars)
                if not is_multi_byte:
                    is_binary = util.is_binary(chunk)
            data_checked = True
        if not is_compressed and not is_binary:
            if not isinstance(chunk, text_type):
                chunk = chunk.decode(source_encoding, source_error)
            os.write(fd, chunk.encode(target_encoding, target_error))
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write(fd, chunk)
    os.close(fd)
    return filename, is_multi_byte
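
A plausible call site for this variant, assuming the surrounding Galaxy imports (util, multi_byte, text_type) are available at module level; the descriptor comes from tempfile.mkstemp and the function closes it on return:

import io
import tempfile

fd, path = tempfile.mkstemp()
stream = io.BytesIO(b"chrom\tstart\tend\nchr1\t100\t200\n")
# Returns the path plus the is_multi_byte flag; fd is closed for us.
name, is_multi_byte = stream_to_open_named_file(stream, fd, path)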
Example 6
def check_binary( name, file_path=True ):
    # Handles files if file_path is True or text if file_path is False
    is_binary = False
    if file_path:
        temp = open( name, "U" )
    else:
        temp = StringIO( name )
    try:
        for char in temp.read( 100 ):
            if util.is_binary( char ):
                is_binary = True
                break
    finally:
        temp.close( )
    return is_binary
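
The "U" (universal newlines) mode used here is Python 2 era; it was deprecated in Python 3 and removed in 3.11, so this variant only runs on older interpreters. A sketch of the same check in modern Python, still assuming util.is_binary accepts a single character as it does above:

from io import StringIO


def check_binary_py3(name, file_path=True):
    # Same 100-character scan, without the removed "U" open mode.
    opened = open(name) if file_path else StringIO(name)
    with opened as temp:
        return any(util.is_binary(char) for char in temp.read(100))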
Example 7
def check_binary(name, file_path=True):
    # Handles files if file_path is True or text if file_path is False
    is_binary = False
    if file_path:
        temp = open(name, "U")
    else:
        temp = StringIO(name)
    try:
        for char in temp.read(100):
            if util.is_binary(char):
                is_binary = True
                break
    finally:
        temp.close()
    return is_binary
Example 8
def stream_to_open_named_file(stream,
                              fd,
                              filename,
                              source_encoding=None,
                              source_error='strict',
                              target_encoding=None,
                              target_error='strict'):
    """Writes a stream to the provided file descriptor, returns the file name. Closes file descriptor"""
    # signature and behavior is somewhat odd, due to backwards compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576
    data_checked = False
    is_compressed = False
    is_binary = False
    try:
        codecs.lookup(target_encoding)
    except Exception:
        target_encoding = util.DEFAULT_ENCODING  # utf-8
    if not source_encoding:
        source_encoding = util.DEFAULT_ENCODING  # sys.getdefaultencoding() would mimic old behavior (defaults to ascii)
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file
            try:
                # Convert chunk to a bytestring if it is not already.
                # Check if the first 2 bytes of the chunk are equal to the
                # gzip magic number.
                if smart_str(chunk)[:2] == util.gzip_magic:
                    is_compressed = True
            except Exception:
                pass
            if not is_compressed:
                is_binary = util.is_binary(chunk)
            data_checked = True
        if not is_compressed and not is_binary:
            if not isinstance(chunk, text_type):
                chunk = chunk.decode(source_encoding, source_error)
            os.write(fd, chunk.encode(target_encoding, target_error))
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            if isinstance(chunk, text_type):
                chunk = chunk.encode(target_encoding, target_error)
            os.write(fd, chunk)
    os.close(fd)
    return filename
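
For reference, the gzip magic number compared against util.gzip_magic is the byte pair 0x1f 0x8b. A dependency-free version of the same probe (hypothetical helper, not a Galaxy API):

GZIP_MAGIC = b"\x1f\x8b"  # the value util.gzip_magic presumably holds


def looks_gzipped(first_chunk):
    # True when the upload starts with the gzip magic number.
    return first_chunk[:2] == GZIP_MAGIC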
Example 9
def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'):
    """Writes a stream to the provided file descriptor, returns the file's name and bool( is_multi_byte ). Closes file descriptor"""
    # signature and behavior is somewhat odd, due to backwards compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576
    data_checked = False
    is_compressed = False
    is_binary = False
    is_multi_byte = False
    try:
        codecs.lookup(target_encoding)
    except Exception:
        target_encoding = util.DEFAULT_ENCODING  # utf-8
    if not source_encoding:
        source_encoding = util.DEFAULT_ENCODING  # sys.getdefaultencoding() would mimic old behavior (defaults to ascii)
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file
            if zipfile.is_zipfile(filename):
                is_compressed = True
            else:
                try:
                    if text_type(chunk[:2]) == text_type(util.gzip_magic):
                        is_compressed = True
                except Exception:
                    pass
            if not is_compressed:
                # See if we have a multi-byte character file
                chars = chunk[:100]
                is_multi_byte = multi_byte.is_multi_byte(chars)
                if not is_multi_byte:
                    is_binary = util.is_binary(chunk)
            data_checked = True
        if not is_compressed and not is_binary:
            if not isinstance(chunk, text_type):
                chunk = chunk.decode(source_encoding, source_error)
            os.write(fd, chunk.encode(target_encoding, target_error))
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write(fd, chunk)
    os.close(fd)
    return filename, is_multi_byte
Example 10
def check_binary( name, file_path=True ):
    # Handles files if file_path is True or text if file_path is False
    is_binary = False
    if file_path:
        temp = open( name, "U" )
    else:
        temp = name
    chars_read = 0
    for chars in temp:
        for char in chars:
            chars_read += 1
            if util.is_binary( char ):
                is_binary = True
                break
            if chars_read > 100:
                break
        # Stop once a binary char is seen or the 100-char budget is spent.
        if is_binary or chars_read > 100:
            break
    if file_path:
        temp.close()
    return is_binary
Example 11
def check_binary(name, file_path=True):
    # Handles files if file_path is True or text if file_path is False
    is_binary = False
    if file_path:
        temp = open(name, "U")
    else:
        temp = name
    chars_read = 0
    for chars in temp:
        for char in chars:
            chars_read += 1
            if util.is_binary(char):
                is_binary = True
                break
            if chars_read > 100:
                break
        # Stop once a binary char is seen or the 100-char budget is spent.
        if is_binary or chars_read > 100:
            break
    if file_path:
        temp.close()
    return is_binary
Example 12
def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'):
    """Writes a stream to the provided file descriptor, returns the file name. Closes file descriptor"""
    # signature and behavior is somewhat odd, due to backwards compatibility, but this can/should be done better
    CHUNK_SIZE = 1048576
    data_checked = False
    is_compressed = False
    is_binary = False
    try:
        codecs.lookup(target_encoding)
    except Exception:
        target_encoding = util.DEFAULT_ENCODING  # utf-8
    if not source_encoding:
        source_encoding = util.DEFAULT_ENCODING  # sys.getdefaultencoding() would mimic old behavior (defaults to ascii)
    while True:
        chunk = stream.read(CHUNK_SIZE)
        if not chunk:
            break
        if not data_checked:
            # See if we're uploading a compressed file
            try:
                # Convert chunk to a bytestring if it is not already.
                # Check if the first 2 bytes of the chunk are equal to the
                # gzip magic number.
                if smart_str(chunk)[:2] == util.gzip_magic:
                    is_compressed = True
            except Exception:
                pass
            if not is_compressed:
                is_binary = util.is_binary(chunk)
            data_checked = True
        if not is_compressed and not is_binary:
            if not isinstance(chunk, text_type):
                chunk = chunk.decode(source_encoding, source_error)
            os.write(fd, chunk.encode(target_encoding, target_error))
        else:
            # Compressed files must be encoded after they are uncompressed in the upload utility,
            # while binary files should not be encoded at all.
            os.write(fd, chunk)
    os.close(fd)
    return filename
Example 13
def guess_ext( fname, sniff_order=None, is_multi_byte=False ):
    """
    Returns an extension that can be used in the datatype factory to
    generate a dataset for the 'fname' file

    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> guess_ext(fname)
    'xml'
    >>> fname = get_test_fname('interval.interval')
    >>> guess_ext(fname)
    'interval'
    >>> fname = get_test_fname('interval1.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('test_tab.bed')
    >>> guess_ext(fname)
    'bed'
    >>> fname = get_test_fname('sequence.maf')
    >>> guess_ext(fname)
    'maf'
    >>> fname = get_test_fname('sequence.fasta')
    >>> guess_ext(fname)
    'fasta'
    >>> fname = get_test_fname('file.html')
    >>> guess_ext(fname)
    'html'
    >>> fname = get_test_fname('test.gtf')
    >>> guess_ext(fname)
    'gtf'
    >>> fname = get_test_fname('test.gff')
    >>> guess_ext(fname)
    'gff'
    >>> fname = get_test_fname('gff_version_3.gff')
    >>> guess_ext(fname)
    'gff3'
    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("a\\t2\\nc\\t1\\nd\\t0")
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("a 1 2 x\\nb 3 4 y\\nc 5 6 z")
    >>> guess_ext(fname)
    'txt'
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> guess_ext(fname)
    'tabular'
    >>> fname = get_test_fname('alignment.lav')
    >>> guess_ext(fname)
    'lav'
    >>> fname = get_test_fname('1.sff')
    >>> guess_ext(fname)
    'sff'
    >>> fname = get_test_fname('1.bam')
    >>> guess_ext(fname)
    'bam'
    >>> fname = get_test_fname('3unsorted.bam')
    >>> guess_ext(fname)
    'bam'
    """
    if sniff_order is None:
        datatypes_registry = registry.Registry()
        datatypes_registry.load_datatypes()
        sniff_order = datatypes_registry.sniff_order
    for datatype in sniff_order:
        """
        Some classes may not have a sniff function, which is ok.  In fact, the
        Tabular and Text classes are 2 examples of classes that should never have
        a sniff function.  Since these classes are default classes, they contain
        few rules to filter out data of other formats, so they should be called
        from this function after all other datatypes in sniff_order have not been
        successfully discovered.
        """
        try:
            if datatype.sniff( fname ):
                return datatype.file_ext
        except Exception:
            pass
    headers = get_headers( fname, None )
    is_binary = False
    if is_multi_byte:
        is_binary = False
    else:
        for hdr in headers:
            for char in hdr:
                # old behavior had 'char' possibly having length > 1,
                # need to determine when/if this occurs
                is_binary = util.is_binary( char )
                if is_binary:
                    break
            if is_binary:
                break
    if is_binary:
        return 'data'  # default binary data type file extension
    if is_column_based( fname, '\t', 1, is_multi_byte=is_multi_byte ):
        return 'tabular'  # default tabular data type file extension
    return 'txt'  # default text data type file extension
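
The sniffers iterated above only need a sniff(fname) method and a file_ext attribute. A toy datatype showing that protocol (hypothetical class, not part of the Galaxy registry):

class FastaToy:
    file_ext = 'fasta'

    def sniff(self, fname):
        # FASTA records start with a '>' header character.
        with open(fname) as fh:
            return fh.read(1) == '>'

# guess_ext tries each sniffer in order, falling back to
# 'data' / 'tabular' / 'txt' when none of them matches:
guess_ext('sequence.fasta', sniff_order=[FastaToy()])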
Example 14
def guess_ext(fname, sniff_order, is_multi_byte=False):
    """
    Returns an extension that can be used in the datatype factory to
    generate a dataset for the 'fname' file

    >>> from galaxy.datatypes import registry
    >>> sample_conf = os.path.join(util.galaxy_directory(), "config", "datatypes_conf.xml.sample")
    >>> datatypes_registry = registry.Registry()
    >>> datatypes_registry.load_datatypes(root_dir=util.galaxy_directory(), config=sample_conf)
    >>> sniff_order = datatypes_registry.sniff_order
    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> guess_ext(fname, sniff_order)
    'blastxml'
    >>> fname = get_test_fname('interval.interval')
    >>> guess_ext(fname, sniff_order)
    'interval'
    >>> fname = get_test_fname('interval1.bed')
    >>> guess_ext(fname, sniff_order)
    'bed'
    >>> fname = get_test_fname('test_tab.bed')
    >>> guess_ext(fname, sniff_order)
    'bed'
    >>> fname = get_test_fname('sequence.maf')
    >>> guess_ext(fname, sniff_order)
    'maf'
    >>> fname = get_test_fname('sequence.fasta')
    >>> guess_ext(fname, sniff_order)
    'fasta'
    >>> fname = get_test_fname('file.html')
    >>> guess_ext(fname, sniff_order)
    'html'
    >>> fname = get_test_fname('test.gtf')
    >>> guess_ext(fname, sniff_order)
    'gtf'
    >>> fname = get_test_fname('test.gff')
    >>> guess_ext(fname, sniff_order)
    'gff'
    >>> fname = get_test_fname('gff_version_3.gff')
    >>> guess_ext(fname, sniff_order)
    'gff3'
    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("a\\t2")
    >>> guess_ext(fname, sniff_order)
    'txt'
    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("a\\t2\\nc\\t1\\nd\\t0")
    >>> guess_ext(fname, sniff_order)
    'tabular'
    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("a 1 2 x\\nb 3 4 y\\nc 5 6 z")
    >>> guess_ext(fname, sniff_order)
    'txt'
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> guess_ext(fname, sniff_order)
    'tabular'
    >>> fname = get_test_fname('alignment.lav')
    >>> guess_ext(fname, sniff_order)
    'lav'
    >>> fname = get_test_fname('1.sff')
    >>> guess_ext(fname, sniff_order)
    'sff'
    >>> fname = get_test_fname('1.bam')
    >>> guess_ext(fname, sniff_order)
    'bam'
    >>> fname = get_test_fname('3unsorted.bam')
    >>> guess_ext(fname, sniff_order)
    'bam'
    >>> fname = get_test_fname('test.idpDB')
    >>> guess_ext(fname, sniff_order)
    'idpdb'
    >>> fname = get_test_fname('test.mz5')
    >>> guess_ext(fname, sniff_order)
    'h5'
    >>> fname = get_test_fname('issue1818.tabular')
    >>> guess_ext(fname, sniff_order)
    'tabular'
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> guess_ext(fname, sniff_order)
    'cml'
    >>> fname = get_test_fname('q.fps')
    >>> guess_ext(fname, sniff_order)
    'fps'
    >>> fname = get_test_fname('drugbank_drugs.inchi')
    >>> guess_ext(fname, sniff_order)
    'inchi'
    >>> fname = get_test_fname('drugbank_drugs.mol2')
    >>> guess_ext(fname, sniff_order)
    'mol2'
    >>> fname = get_test_fname('drugbank_drugs.sdf')
    >>> guess_ext(fname, sniff_order)
    'sdf'
    >>> fname = get_test_fname('5e5z.pdb')
    >>> guess_ext(fname, sniff_order)
    'pdb'
    >>> fname = get_test_fname('mothur_datatypetest_true.mothur.otu')
    >>> guess_ext(fname, sniff_order)
    'mothur.otu'
    >>> fname = get_test_fname('1.gg')
    >>> guess_ext(fname, sniff_order)
    'gg'
    >>> fname = get_test_fname('diamond_db.dmnd')
    >>> guess_ext(fname, sniff_order)
    'dmnd'
    >>> fname = get_test_fname('1.xls')
    >>> guess_ext(fname, sniff_order)
    'excel.xls'
    >>> fname = get_test_fname('biom2_sparse_otu_table_hdf5.biom')
    >>> guess_ext(fname, sniff_order)
    'biom2'
    """
    file_ext = None
    for datatype in sniff_order:
        """
        Some classes may not have a sniff function, which is ok.  In fact, the
        Tabular and Text classes are 2 examples of classes that should never have
        a sniff function.  Since these classes are default classes, they contain
        few rules to filter out data of other formats, so they should be called
        from this function after all other datatypes in sniff_order have not been
        successfully discovered.
        """
        try:
            if datatype.sniff(fname):
                file_ext = datatype.file_ext
                break
        except Exception:
            pass
    # Ugly hack for tsv vs tabular sniffing: we want to prefer tabular
    # to tsv, but tabular doesn't have a sniffer - if TSV was sniffed, just
    # check whether it is an acceptable tabular and use that instead.
    if file_ext == 'tsv':
        if is_column_based(fname, '\t', 1, is_multi_byte=is_multi_byte):
            file_ext = 'tabular'
    if file_ext is not None:
        return file_ext

    headers = get_headers(fname, None)
    is_binary = False
    if is_multi_byte:
        is_binary = False
    else:
        for hdr in headers:
            for char in hdr:
                # old behavior had 'char' possibly having length > 1,
                # need to determine when/if this occurs
                is_binary = util.is_binary(char)
                if is_binary:
                    break
            if is_binary:
                break
    if is_binary:
        return 'data'  # default binary data type file extension
    if is_column_based(fname, '\t', 1, is_multi_byte=is_multi_byte):
        return 'tabular'  # default tabular data type file extension
    return 'txt'  # default text data type file extension
Example 15
def guess_ext(fname, sniff_order, is_multi_byte=False):
    """
    Returns an extension that can be used in the datatype factory to
    generate a dataset for the 'fname' file

    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> from galaxy.datatypes import registry
    >>> sample_conf = os.path.join(util.galaxy_directory(), "config", "datatypes_conf.xml.sample")
    >>> datatypes_registry = registry.Registry()
    >>> datatypes_registry.load_datatypes(root_dir=util.galaxy_directory(), config=sample_conf)
    >>> sniff_order = datatypes_registry.sniff_order
    >>> guess_ext(fname, sniff_order)
    'xml'
    >>> fname = get_test_fname('interval.interval')
    >>> guess_ext(fname, sniff_order)
    'interval'
    >>> fname = get_test_fname('interval1.bed')
    >>> guess_ext(fname, sniff_order)
    'bed'
    >>> fname = get_test_fname('test_tab.bed')
    >>> guess_ext(fname, sniff_order)
    'bed'
    >>> fname = get_test_fname('sequence.maf')
    >>> guess_ext(fname, sniff_order)
    'maf'
    >>> fname = get_test_fname('sequence.fasta')
    >>> guess_ext(fname, sniff_order)
    'fasta'
    >>> fname = get_test_fname('file.html')
    >>> guess_ext(fname, sniff_order)
    'html'
    >>> fname = get_test_fname('test.gtf')
    >>> guess_ext(fname, sniff_order)
    'gtf'
    >>> fname = get_test_fname('test.gff')
    >>> guess_ext(fname, sniff_order)
    'gff'
    >>> fname = get_test_fname('gff_version_3.gff')
    >>> guess_ext(fname, sniff_order)
    'gff3'
    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("a\\t2")
    >>> guess_ext(fname, sniff_order)
    'txt'
    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("a\\t2\\nc\\t1\\nd\\t0")
    >>> guess_ext(fname, sniff_order)
    'tabular'
    >>> fname = get_test_fname('temp.txt')
    >>> open(fname, 'wt').write("a 1 2 x\\nb 3 4 y\\nc 5 6 z")
    >>> guess_ext(fname, sniff_order)
    'txt'
    >>> fname = get_test_fname('test_tab1.tabular')
    >>> guess_ext(fname, sniff_order)
    'tabular'
    >>> fname = get_test_fname('alignment.lav')
    >>> guess_ext(fname, sniff_order)
    'lav'
    >>> fname = get_test_fname('1.sff')
    >>> guess_ext(fname, sniff_order)
    'sff'
    >>> fname = get_test_fname('1.bam')
    >>> guess_ext(fname, sniff_order)
    'bam'
    >>> fname = get_test_fname('3unsorted.bam')
    >>> guess_ext(fname, sniff_order)
    'bam'
    >>> fname = get_test_fname('test.idpDB')
    >>> guess_ext(fname, sniff_order)
    'idpdb'
    >>> fname = get_test_fname('test.mz5')
    >>> guess_ext(fname, sniff_order)
    'h5'
    >>> fname = get_test_fname('issue1818.tabular')
    >>> guess_ext(fname, sniff_order)
    'tabular'
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> guess_ext(fname, sniff_order)
    'cml'
    >>> fname = get_test_fname('q.fps')
    >>> guess_ext(fname, sniff_order)
    'fps'
    >>> fname = get_test_fname('drugbank_drugs.inchi')
    >>> guess_ext(fname, sniff_order)
    'inchi'
    >>> fname = get_test_fname('drugbank_drugs.mol2')
    >>> guess_ext(fname, sniff_order)
    'mol2'
    >>> fname = get_test_fname('drugbank_drugs.sdf')
    >>> guess_ext(fname, sniff_order)
    'sdf'
    >>> fname = get_test_fname('5e5z.pdb')
    >>> guess_ext(fname, sniff_order)
    'pdb'
    >>> fname = get_test_fname('mothur_datatypetest_true.mothur.otu')
    >>> guess_ext(fname, sniff_order)
    'mothur.otu'
    """
    file_ext = None
    for datatype in sniff_order:
        """
        Some classes may not have a sniff function, which is ok.  In fact, the
        Tabular and Text classes are 2 examples of classes that should never have
        a sniff function.  Since these classes are default classes, they contain
        few rules to filter out data of other formats, so they should be called
        from this function after all other datatypes in sniff_order have not been
        successfully discovered.
        """
        try:
            if datatype.sniff(fname):
                file_ext = datatype.file_ext
                break
        except Exception:
            pass
    # Ugly hack for tsv vs tabular sniffing: we want to prefer tabular
    # to tsv, but tabular doesn't have a sniffer - if TSV was sniffed, just
    # check whether it is an acceptable tabular and use that instead.
    if file_ext == 'tsv':
        if is_column_based(fname, '\t', 1, is_multi_byte=is_multi_byte):
            file_ext = 'tabular'
    if file_ext is not None:
        return file_ext

    headers = get_headers(fname, None)
    is_binary = False
    if is_multi_byte:
        is_binary = False
    else:
        for hdr in headers:
            for char in hdr:
                # old behavior had 'char' possibly having length > 1,
                # need to determine when/if this occurs
                is_binary = util.is_binary(char)
                if is_binary:
                    break
            if is_binary:
                break
    if is_binary:
        return 'data'  # default binary data type file extension
    if is_column_based(fname, '\t', 1, is_multi_byte=is_multi_byte):
        return 'tabular'  # default tabular data type file extension
    return 'txt'  # default text data type file extension
Example 16
    def set_meta(self, dataset, **kwd):
        if dataset.has_data():
            dataset.metadata.field_names = []
            dataset.metadata.field_components = {}
            dataset_type = None
            field_components = {}
            dataset_structure_complete = False
            processing_field_section = False
            with open(dataset.file_name) as fh:
                for i, line in enumerate(fh):
                    line = line.strip()
                    if not line:
                        continue
                    if i < 3:
                        dataset = self.set_initial_metadata(i, line, dataset)
                    elif dataset.metadata.file_format == 'ASCII' or not util.is_binary(line):
                        if dataset_structure_complete:
                            """
                            The final part of legacy VTK files describes the dataset attributes.
                            This part begins with the keywords POINT_DATA or CELL_DATA, followed
                            by an integer number specifying the number of points or cells,
                            respectively. Other keyword/data combinations then define the actual
                            dataset attribute values (i.e., scalars, vectors, tensors, normals,
                            texture coordinates, or field data).  Dataset attributes are supported
                            for both points and cells.

                            Each type of attribute data has a dataName associated with it. This is
                            a character string (without embedded whitespace) used to identify a
                            particular data.  The dataName is used by the VTK readers to extract
                            data. As a result, more than one attribute data of the same type can be
                            included in a file.  For example, two different scalar fields defined
                            on the dataset points, pressure and temperature, can be contained in
                            the same file.  If the appropriate dataName is not specified in the VTK
                            reader, then the first data of that type is extracted from the file.
                            """
                            items = line.split()
                            if items[0] == 'SCALARS':
                                # Example: SCALARS surface_field double 3
                                # Scalar definition includes specification of a lookup table. The
                                # definition of a lookup table is optional. If not specified, the
                                # default VTK table will be used, and tableName should be
                                # "default". Also note that the numComp variable is optional.  By
                                # default the number of components is equal to one.  The parameter
                                # numComp must range between (1,4) inclusive; in versions of VTK
                                # prior to vtk2.3 this parameter was not supported.
                                field_name = items[1]
                                dataset.metadata.field_names.append(field_name)
                                try:
                                    num_components = int(items[-1])
                                except Exception:
                                    num_components = 1
                                field_component_indexes = [str(i) for i in range(num_components)]
                                field_components[field_name] = field_component_indexes
                            elif items[0] == 'FIELD':
                                # The dataset consists of CELL_DATA.
                                # FIELD FieldData 2
                                processing_field_section = True
                                num_fields = int(items[-1])
                                fields_processed = []
                            elif processing_field_section:
                                if len(fields_processed) == num_fields:
                                    processing_field_section = False
                                else:
                                    try:
                                        float(items[0])
                                        # Don't process the cell data.
                                        # 0.0123457 0.197531
                                    except Exception:
                                        # Line consists of arrayName numComponents numTuples dataType.
                                        # Example: surface_field1 1 12 double
                                        field_name = items[0]
                                        dataset.metadata.field_names.append(field_name)
                                        num_components = int(items[1])
                                        field_component_indexes = [str(i) for i in range(num_components)]
                                        field_components[field_name] = field_component_indexes
                                        fields_processed.append(field_name)
                        elif line.startswith('CELL_DATA'):
                            # CELL_DATA 3188
                            dataset_structure_complete = True
                            dataset.metadata.cells = int(line.split()[1])
                        elif line.startswith('POINT_DATA'):
                            # POINT_DATA 1876
                            dataset_structure_complete = True
                            dataset.metadata.points = int(line.split()[1])
                        else:
                            dataset, dataset_type = self.set_structure_metadata(line, dataset, dataset_type)
            if len(field_components) > 0:
                dataset.metadata.field_components = field_components
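
To make the FIELD branch concrete, here is the kind of attribute section this parser walks, with invented sample values rather than a real VTK test file; for these lines the method ends up with field_names == ['surface_field1'] and field_components == {'surface_field1': ['0']}:

CELL_DATA 3188
FIELD FieldData 1
surface_field1 1 12 double
0.0123457 0.197531 0.0123457 0.197531 0.0123457 0.197531
0.0123457 0.197531 0.0123457 0.197531 0.0123457 0.197531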