def sniff(self, filename):
    """
    Report whether the file looks like a PDB file.

    The first space-delimited token of each header line (requested from
    ``get_headers`` with ``count=300``) is collected; the file is accepted
    only when all of HEADER, TITLE, COMPND, SOURCE, KEYWDS and EXPDTA occur.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('5e5z.pdb')
    >>> PDB().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> PDB().sniff(fname)
    False
    """
    required_sections = {'HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA'}
    seen_sections = set()
    for fields in get_headers(filename, sep=' ', count=300):
        seen_sections.add(fields[0].strip())
    # Every required record name must have been seen at least once.
    return required_sections <= seen_sections
def sniff(self, filename):
    """
    Determines whether the file is in generic fastq format.

    For details, see http://maq.sourceforge.net/fastq.shtml

    Note: There are three kinds of FASTQ files, known as "Sanger"
    (sometimes called "Standard"), Solexa, and Illumina. These differ in
    the representation of the quality scores.

    Only the first block is inspected: the first line must start with
    "@", the third with "+", and the second (the sequence) must be
    non-empty and consist solely of the letters G/C/A/T/N (any case).

    >>> fname = get_test_fname( '1.fastqsanger' )
    >>> Fastq().sniff( fname )
    True
    >>> fname = get_test_fname( '2.fastqsanger' )
    >>> Fastq().sniff( fname )
    True
    """
    headers = get_headers(filename, None)
    if len(headers) < 4:
        return False
    try:
        title = headers[0][0]
        sequence = headers[1][0]
        separator = headers[2][0]
    except IndexError:
        # One of the first three lines produced no fields (e.g. it was blank).
        return False
    if not title.startswith("@") or not separator.startswith("+") or not sequence:
        return False
    # The pattern must be anchored at the end; without the trailing "$" it
    # matches a zero-length prefix of any string and never rejects anything.
    return re.match(r"^[NGTAC]*$", sequence, re.IGNORECASE) is not None
def sniff(self, filename):
    """
    Report whether the file looks like a PDBQT file.

    The first space-delimited token of each header line (requested from
    ``get_headers`` with ``count=300``) is collected; the file is accepted
    only when all of REMARK, ROOT, ENDROOT, BRANCH and TORSDOF occur.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('NuBBE_1_obabel_3D.pdbqt')
    >>> PDBQT().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> PDBQT().sniff(fname)
    False
    """
    required_sections = {'REMARK', 'ROOT', 'ENDROOT', 'BRANCH', 'TORSDOF'}
    seen_sections = set()
    for fields in get_headers(filename, sep=' ', count=300):
        seen_sections.add(fields[0].strip())
    # Every required record name must have been seen at least once.
    return required_sections <= seen_sections
def sniff(self, filename):
    """
    Determines whether the file is in maf format.

    The .maf format is line-oriented. Each multiple alignment ends with a
    blank line. Each sequence in an alignment is on a single line, which
    can get quite long, but there is no length limit. Words in a line are
    delimited by any white space. Lines starting with # are considered to
    be comments. Lines starting with ## can be ignored by most programs,
    but contain meta-data of one form or another.

    The first line of a .maf file begins with ##maf. This word is followed
    by white-space-separated variable=value pairs. There should be no
    white space surrounding the "=".

    For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

    >>> fname = get_test_fname( 'sequence.maf' )
    >>> Maf().sniff( fname )
    True
    >>> fname = get_test_fname( 'sequence.fasta' )
    >>> Maf().sniff( fname )
    False
    """
    headers = get_headers(filename, None)
    # More than one line is required, and the first token of the first line
    # must be exactly "##maf". The explicit emptiness check on headers[0]
    # replaces a bare ``except:`` that also swallowed KeyboardInterrupt.
    return len(headers) > 1 and bool(headers[0]) and headers[0][0] == "##maf"
def sniff(self, filename):
    """
    Determines whether the file is a secondary structure map format.

    A single column with an integer value which indicates the row that
    this row maps to. The pairing must be symmetric: if
    structMap[10] = 380 then structMap[380] = 10 and vice versa.
    At least three rows are required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.map' )
    >>> SecondaryStructureMap().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.map' )
    >>> SecondaryStructureMap().sniff( fname )
    False
    """
    rows = get_headers(filename, sep='\t')
    # Maps a forward-referenced row number to the row that pointed at it.
    partner_of = {}
    row_number = 0
    for row_number, fields in enumerate(rows, start=1):
        if len(fields) > 1:
            return False
        try:
            target = int(fields[0])
            if target > row_number:
                # Forward reference: remember it so the target row can be checked.
                partner_of[target] = row_number
            elif (target > 0 or row_number in partner_of) and partner_of[row_number] != target:
                # Backward reference that does not mirror the earlier row.
                return False
        except (ValueError, KeyError):
            # Non-integer value, or a backward reference nobody announced.
            return False
    return row_number >= 3
def sniff(self, filename):
    """
    Checks for 'pileup-ness'.

    There are two main types of pileup: 6-column and 10-column. For both,
    the first three and last two columns are the same. We only check the
    first three to allow for some personalization of the format.

    Every non-empty header line not starting with '#' must have at least
    three tab-separated columns, an integer in column 1 (0-based) and a
    single reference base (A/C/G/T/N, either case) in column 2.

    >>> fname = get_test_fname( 'interval.interval' )
    >>> Pileup().sniff( fname )
    False
    >>> fname = get_test_fname( '6col.pileup' )
    >>> Pileup().sniff( fname )
    True
    >>> fname = get_test_fname( '10col.pileup' )
    >>> Pileup().sniff( fname )
    True
    """
    valid_bases = ('A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n')
    for hdr in get_headers(filename, '\t'):
        if not hdr or hdr[0].startswith('#'):
            continue
        if len(hdr) < 3:
            return False
        try:
            # chrom start in column 1 (with 0-based columns)
            int(hdr[1])
        except ValueError:
            return False
        # Reference base is in column 2. An explicit test is used rather
        # than ``assert``, which is removed when Python runs with -O.
        if hdr[2] not in valid_bases:
            return False
    return True
def sniff(self, filename):
    """
    Determines whether the file is a pairwise distance matrix
    (Column-formatted distance matrix) format.

    The first and second columns have the sequence names and the third
    column is the distance between those sequences. More than two data
    lines are required, and at least one distance must not be an integer.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
    >>> PairwiseDistanceMatrix().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
    >>> PairwiseDistanceMatrix().sniff( fname )
    False
    """
    headers = get_headers(filename, sep='\t')
    count = 0
    # Must start as True: it was previously assigned only when a
    # non-integer value was met, so an all-integer file raised
    # UnboundLocalError at the final return.
    all_ints = True
    for line in headers:
        if not line[0].startswith('@'):
            if len(line) != 3:
                return False
            try:
                float(line[2])
            except ValueError:
                return False
            try:
                # See if it's also an integer
                int(line[2])
            except ValueError:
                # At least one value is not an integer
                all_ints = False
            count += 1
    if count > 2:
        return not all_ints
    return False
def set_meta(self, dataset, overwrite=True, **kwd):
    """
    Set metadata for Otu files.

    After delegating to the parent ``set_meta``, and only when the dataset
    has data, reads every tab-separated line and stores on
    ``dataset.metadata``: ``data_lines`` (lines with at least two columns
    that do not start with '@'), ``columns`` (widest such line),
    ``labels`` (sorted distinct first-column values) and ``otulabels``
    (sorted columns 2.. of the very first line).
    """
    super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)
    if dataset.has_data():
        label_names = set()
        otulabel_names = set()
        ncols = 0
        data_lines = 0
        headers = get_headers(dataset.file_name, sep='\t', count=-1)
        # set otulabels; guard against no lines being returned at all
        if headers and len(headers[0]) > 2:
            otulabel_names = headers[0][2:]
        # set label names and number of lines
        for line in headers:
            if len(line) >= 2 and not line[0].startswith('@'):
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])
        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = sorted(label_names)
        dataset.metadata.otulabels = sorted(otulabel_names)
def sniff(self, filename):
    """
    Determines whether the file is in mothur oligos format.

    http://www.mothur.org/wiki/Oligos_File

    Lines starting with '@' or '#' are ignored. Every other line must be
    either a two-column 'forward'/'reverse' line or a three-column
    'barcode' line, and at least one such line must be present.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.oligos' )
    >>> Oligos().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.oligos' )
    >>> Oligos().sniff( fname )
    False
    """
    matched_rows = 0
    for fields in get_headers(filename, sep='\t'):
        if fields[0].startswith('@') or fields[0].startswith('#'):
            continue
        is_primer = len(fields) == 2 and fields[0] in ['forward', 'reverse']
        is_barcode = len(fields) == 3 and fields[0] == 'barcode'
        if not (is_primer or is_barcode):
            return False
        matched_rows += 1
    return matched_rows > 0
def sniff(self, filename):
    """
    Determines whether the file is otu (operational taxonomic unit) format.

    Lines starting with '@' are ignored. Every other line needs at least
    two columns; from the second such line on, column 1 must be an integer
    equal to the number of columns that follow it. More than two such
    lines are required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
    >>> Otu().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
    >>> Otu().sniff( fname )
    False
    """
    rows_seen = 0
    for fields in get_headers(filename, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if len(fields) < 2:
            return False
        # The first accepted row is exempt from the count check.
        if rows_seen >= 1:
            try:
                declared = int(fields[1])
            except ValueError:
                return False
            if declared + 2 != len(fields):
                return False
        rows_seen += 1
    return rows_seen > 2
def set_meta(self, dataset, overwrite=True, skip=1, **kwd):
    """
    Set metadata for GroupAbund (shared) files.

    After delegating to the parent ``set_meta``, and only when the dataset
    has data, reads every tab-separated line and stores on
    ``dataset.metadata``: ``data_lines``, ``columns`` (widest data line),
    ``labels`` and ``groups`` (sorted distinct values of columns 0 and 1)
    and ``skip``. A line whose first two fields are 'label' and 'Group'
    is treated as the column-heading line rather than data. Lines with
    fewer than two fields are ignored.
    """
    super(GroupAbund, self).set_meta(dataset, overwrite=overwrite, **kwd)

    # See if file starts with header line
    if dataset.has_data():
        label_names = set()
        group_names = set()
        data_lines = 0
        ncols = 0

        headers = get_headers(dataset.file_name, sep='\t', count=-1)
        for line in headers:
            if len(line) < 2:
                # A blank or single-field line has no group column; the
                # unguarded line[1] access used to raise IndexError here.
                continue
            if line[0] == 'label' and line[1] == 'Group':
                skip = 1
            else:
                # NOTE(review): this resets skip on every data line, so the
                # stored value is 0 whenever the last line is data, even if
                # a heading line was seen. Left unchanged; confirm intent.
                skip = 0
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])
                group_names.add(line[1])

        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = sorted(label_names)
        dataset.metadata.groups = sorted(group_names)
        dataset.metadata.skip = skip
def sniff(self, filename):
    """
    Determines whether the file is in sabund format.

    label<TAB>count[<TAB>value(1..n)]

    Lines starting with '@' are ignored. On every other line, column 1
    must be an integer equal to the number of columns that follow it, and
    each of those following columns must itself be an integer. At least
    one such line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.sabund' )
    >>> Sabund().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.sabund' )
    >>> Sabund().sniff( fname )
    False
    """
    rows_seen = 0
    for fields in get_headers(filename, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if len(fields) < 2:
            return False
        try:
            if int(fields[1]) + 2 != len(fields):
                return False
            for value in fields[2:]:
                int(value)
        except ValueError:
            return False
        rows_seen += 1
    return rows_seen > 0
def sniff(self, filename):
    """
    Determines whether the file is in lav format.

    LAV is an alignment format developed by Webb Miller's group. It is the
    primary output format for BLASTZ. The first line of a .lav file begins
    with #:lav.

    For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'alignment.lav' )
    >>> Lav().sniff( fname )
    True
    >>> fname = get_test_fname( 'alignment.axt' )
    >>> Lav().sniff( fname )
    False
    """
    headers = get_headers(filename, None)
    # More than one line is required and the first token of the first line
    # must start with "#:lav". The explicit emptiness check on headers[0]
    # replaces a bare ``except:`` that also swallowed KeyboardInterrupt.
    return len(headers) > 1 and bool(headers[0]) and headers[0][0].startswith('#:lav')
def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
    """
    Set metadata for Group files.

    Delegates to the parent ``set_meta`` and then stores in
    ``dataset.metadata.groups`` the distinct values found in the second
    tab-separated column of the file (lines with a single column are
    ignored).
    """
    super(Group, self).set_meta(dataset, overwrite, skip, max_data_lines)
    rows = get_headers(dataset.file_name, sep='\t', count=-1)
    dataset.metadata.groups = list({fields[1] for fields in rows if len(fields) > 1})
def sniff(self, filename):
    """
    Report whether the file looks like an InChI file.

    InChI files start with 'InChI='. Every header line returned by
    ``get_headers`` (requested with ``count=10``) must begin with that
    prefix, and at least one line must be present.
    """
    inchi_lines = get_headers(filename, sep=' ', count=10)
    if not inchi_lines:
        # Without this guard the loop is skipped and an empty file would
        # be reported as InChI.
        return False
    for inchi in inchi_lines:
        if not inchi[0].startswith('InChI='):
            return False
    return True
def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd):
    """
    Set metadata for SffFlow files.

    Delegates to the parent ``set_meta`` (always passing 1 as the skip
    argument, regardless of ``skip``) and then stores the integer found in
    the first field of the first line as ``dataset.metadata.flow_values``.
    Any failure while reading or converting that value is logged as a
    warning instead of being raised.
    """
    super(SffFlow, self).set_meta(dataset, overwrite, 1, max_data_lines)
    first_row = get_headers(dataset.file_name, sep='\t', count=1)
    try:
        dataset.metadata.flow_values = int(first_row[0][0])
    except Exception as err:
        log.warning("SffFlow set_meta %s" % err)
def sniff(self, filename):
    """
    Determines whether the file is an axes format.

    The first line may have column headings. The following lines have the
    name in the first column plus float columns for each axis.

    ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
    group   axis1   axis2
    forest  0.000000        0.145743
    pasture 0.145743        0.000000

    ==> 98_sq_phylip_amazon.nmds.axes <==
            axis1   axis2
    U68589  0.262608        -0.077498
    U68590  0.027118        0.195197
    U68591  0.329854        0.014395

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.axes' )
    >>> Axes().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.axes' )
    >>> Axes().sniff( fname )
    False
    """
    rows = get_headers(filename, sep='\t')
    rows_seen = 0
    width = None
    only_integers = True
    for fields in rows:
        # The very first row is never inspected (it may hold headings).
        if rows_seen != 0:
            if width is None:
                # The second row fixes the expected column count.
                width = len(fields)
                if width < 2:
                    return False
            else:
                if len(fields) != width:
                    return False
                try:
                    for text in fields[1:width]:
                        number = float(text)
                        # Check abs value is <= 1.0
                        if abs(number) > 1.0:
                            return False
                        # Also test for whether value is an integer
                        try:
                            int(text)
                        except ValueError:
                            only_integers = False
                except ValueError:
                    return False
        rows_seen += 1
    # A file whose checked values are all integers is not accepted.
    return rows_seen > 0 and not only_integers
def sniff(self, filename):
    """
    Determine if the file is in pdf format.

    True when the first token of the first line starts with "%PDF";
    False otherwise, including when there is no first line or token.
    """
    first_lines = get_headers(filename, None, 1)
    try:
        return first_lines[0][0].startswith("%PDF")
    except IndexError:
        return False
def set_meta(self, dataset, overwrite=True, skip=0, **kwd):
    """
    Set metadata for distance matrix files.

    Delegates to the parent ``set_meta`` and then scans the header lines
    (ignoring those starting with '@') for the first one whose fields,
    joined together, parse as an integer; that value is stored as
    ``dataset.metadata.sequence_count``. Lines that fail to parse are
    skipped, with a warning logged unless this instance is a
    PairwiseDistanceMatrix.
    """
    super(DistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)

    for fields in get_headers(dataset.file_name, sep='\t'):
        if fields[0].startswith('@'):
            continue
        try:
            # seq count sometimes preceded by tab, hence the join
            dataset.metadata.sequence_count = int(''.join(fields))
        except Exception as err:
            if not isinstance(self, PairwiseDistanceMatrix):
                log.warning("DistanceMatrix set_meta %s" % err)
        else:
            break
def sniff_prefix(self, file_prefix):
    """
    Try to guess if the file is a Gal file.

    True when the first field of line 1 contains "ATF" and the first
    field of line 3 contains "GenePix ArrayList". Inputs with fewer than
    three lines are rejected instead of raising IndexError.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('test.gal')
    >>> Gal().sniff(fname)
    True
    >>> fname = get_test_fname('test.gpr')
    >>> Gal().sniff(fname)
    False
    """
    headers = get_headers(file_prefix, sep="\t", count=3)
    if len(headers) < 3:
        return False
    return "ATF" in headers[0][0] and "GenePix ArrayList" in headers[2][0]
def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd):
    """
    Set metadata for CountTable files.

    Delegates to the parent ``set_meta`` and then uses the first line as
    the column-name row: ``column_types`` becomes 'str' followed by 'int'
    for every remaining column, ``columns`` is set when there is more than
    one column, ``groups`` is set to columns 2.. when there are more than
    two, ``comment_lines`` is set to 1 and ``data_lines`` is reduced by
    one to account for that heading line.
    """
    super(CountTable, self).set_meta(dataset, overwrite=overwrite, **kwd)

    headers = get_headers(dataset.file_name, sep='\t', count=1)
    if not headers:
        # Nothing to read; headers[0] would raise IndexError.
        return
    colnames = headers[0]
    dataset.metadata.column_types = ['str'] + (['int'] * (len(colnames) - 1))
    if len(colnames) > 1:
        dataset.metadata.columns = len(colnames)
    if len(colnames) > 2:
        dataset.metadata.groups = colnames[2:]

    dataset.metadata.comment_lines = 1
    # data_lines may not hold an int (e.g. when it was never computed);
    # only adjust it when the subtraction is meaningful.
    if isinstance(dataset.metadata.data_lines, int):
        dataset.metadata.data_lines -= 1
def sniff(self, filename):
    """
    Determines whether the file is a lower-triangle distance matrix
    (phylip) format.

    The first line has the number of sequences in the matrix. The
    remaining lines have the sequence name followed by a list of distances
    from all preceding sequences.

            5  # possibly but not always preceded by a tab :/
            U68589
            U68590  0.3371
            U68591  0.3609  0.3782
            U68592  0.4155  0.3197  0.4148
            U68593  0.2872  0.1690  0.3361  0.2842

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.lower.dist' )
    >>> LowerTriangleDistanceMatrix().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.lower.dist' )
    >>> LowerTriangleDistanceMatrix().sniff( fname )
    False
    """
    numlines = 300
    headers = get_headers(filename, sep='\t', count=numlines)
    line_num = 0
    # Stays None until the count line has been parsed; previously the name
    # was unbound (UnboundLocalError) when no data line was present.
    sequence_count = None
    for line in headers:
        if not line[0].startswith('@'):
            # first line should contain the number of sequences in the file
            if line_num == 0:
                if len(line) > 2:
                    return False
                try:
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
            else:
                # number of fields should equal the line number
                if len(line) != line_num:
                    return False
                try:
                    # Distances should be floats
                    # NOTE(review): the slice starts at index 2, so the
                    # field at index 1 is not validated; confirm intent.
                    for column in line[2:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

    if sequence_count is None:
        return False
    # check if the number of lines in the file was as expected
    return line_num == sequence_count + 1 or line_num == numlines + 1
def set_meta(self, dataset, overwrite=True, **kwd):
    """
    Populate the metadata of an Otu dataset.

    After the parent ``set_meta`` has run, and only if the dataset has
    data, the whole file is scanned as tab-separated rows. Rows with at
    least two columns that do not start with '@' count as data; their
    number, the widest column count and the sorted distinct first-column
    values are stored as ``data_lines``, ``columns`` and ``labels``.
    Columns 2.. of the first line, sorted, are stored as ``otulabels``.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> from galaxy.util.bunch import Bunch
    >>> dataset = Bunch()
    >>> dataset.metadata = Bunch
    >>> otu = Otu()
    >>> dataset.file_name = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
    >>> dataset.has_data = lambda: True
    >>> otu.set_meta(dataset)
    >>> dataset.metadata.columns
    100
    >>> len(dataset.metadata.labels) == 37
    True
    >>> len(dataset.metadata.otulabels) == 98
    True
    """
    super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

    if not dataset.has_data():
        return

    labels = set()
    otu_labels = set()
    widest_row = 0
    n_data = 0

    rows = iter_headers(dataset.file_name, sep='\t', count=-1)
    leading = get_headers(dataset.file_name, sep='\t', count=1)
    if leading:
        leading = leading[0]
    # OTU labels come from the first line, after its first two columns.
    if len(leading) > 2:
        otu_labels = leading[2:]

    for fields in rows:
        if len(fields) >= 2 and not fields[0].startswith('@'):
            n_data += 1
            widest_row = max(widest_row, len(fields))
            labels.add(fields[0])

    # Store the discovered values on the dataset.
    dataset.metadata.data_lines = n_data
    dataset.metadata.columns = widest_row
    dataset.metadata.labels = sorted(labels)
    dataset.metadata.otulabels = sorted(otu_labels)
def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd):
    """
    Set metadata for CountTable files.

    Delegates to the parent ``set_meta`` and then uses the first line as
    the column-name row: ``column_types`` becomes 'str' followed by 'int'
    for every remaining column, ``columns`` is set when there is more than
    one column, ``groups`` is set to columns 2.. when there are more than
    two, ``comment_lines`` is set to 1 and, when ``data_lines`` is an int,
    it is reduced by one to account for that heading line.
    """
    super(CountTable, self).set_meta(dataset, overwrite=overwrite, **kwd)

    headers = get_headers(dataset.file_name, sep='\t', count=1)
    if not headers:
        # Nothing to read; headers[0] would raise IndexError.
        return
    colnames = headers[0]
    dataset.metadata.column_types = ['str'] + (['int'] * (len(colnames) - 1))
    if len(colnames) > 1:
        dataset.metadata.columns = len(colnames)
    if len(colnames) > 2:
        dataset.metadata.groups = colnames[2:]

    dataset.metadata.comment_lines = 1
    if isinstance(dataset.metadata.data_lines, int):
        dataset.metadata.data_lines -= 1
def sniff(self, filename):
    """
    Determines whether the file is a Reference Taxonomy.

    http://www.mothur.org/wiki/Taxonomy_outline

    A table with 2 or 3 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order)
    - integer ?

    Example: 2-column (http://www.mothur.org/wiki/Taxonomy_outline)
     X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
     X97975.1        Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
     AF052717.1      Eukaryota;Parabasalidea;
    Example: 3-column (http://vamps.mbl.edu/resources/databases.php)
     v3_AA008        Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus      5
     v3_AA016        Bacteria        120
     v3_AA019        Archaea;Crenarchaeota;Marine_Group_I    1

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.ref.taxonomy' )
    >>> RefTaxonomy().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.ref.taxonomy' )
    >>> RefTaxonomy().sniff( fname )
    False
    """
    rows = get_headers(filename, sep='\t', count=300)
    taxonomy_pattern = re.compile('^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$')
    rows_seen = 0
    saw_semicolon = False
    for fields in rows:
        if fields[0].startswith('@') or fields[0].startswith('#'):
            continue
        if len(fields) < 2 or len(fields) > 3:
            return False
        taxonomy = fields[1]
        if not taxonomy_pattern.match(taxonomy):
            return False
        if ';' in taxonomy:
            saw_semicolon = True
        if len(fields) == 3:
            try:
                int(fields[2])
            except Exception:
                return False
        rows_seen += 1
    # Require that at least one entry has semicolons in the 2nd column
    return rows_seen > 0 and saw_semicolon
def sniff(self, filename):
    """
    Determines whether the file is a square distance matrix
    (Column-formatted distance matrix) format.

    The first line has the number of sequences in the matrix. The
    following lines have the sequence name in the first column plus a
    column for the distance to each sequence in the row order in which
    they appear in the matrix.

           3
           U68589  0.0000  0.3371  0.3610
           U68590  0.3371  0.0000  0.3783
           U68590  0.3371  0.0000  0.3783

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.square.dist' )
    >>> SquareDistanceMatrix().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.square.dist' )
    >>> SquareDistanceMatrix().sniff( fname )
    False
    """
    numlines = 300
    headers = get_headers(filename, sep='\t', count=numlines)
    line_num = 0
    # Stays None until the count line has been parsed; previously the name
    # was unbound (UnboundLocalError) when no data line was present.
    sequence_count = None
    for line in headers:
        if not line[0].startswith('@'):
            if line_num == 0:
                if len(line) > 2:
                    return False
                try:
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
            else:
                # number of fields should equal the number of sequences
                if len(line) != sequence_count + 1:
                    return False
                try:
                    # Distances should be floats
                    # NOTE(review): the slice starts at index 2, so the
                    # field at index 1 is not validated; confirm intent.
                    for column in line[2:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

    if sequence_count is None:
        return False
    # check if the number of lines in the file was as expected
    return line_num == sequence_count + 1 or line_num == numlines + 1
def sniff(self, filename):
    """
    Report whether the first line is the PlantTribes Ks components header.

    The first field returned by ``get_headers`` is compared with the
    expected tab-joined column-name line. Note the separator handed to
    ``get_headers`` is the two-character string backslash + 't', not a
    tab character. Any error while reading yields False.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('test_tab.bed')
    >>> PlantTribesKsComponents().sniff(fname)
    False
    >>> fname = get_test_fname('1.ptkscmp')
    >>> PlantTribesKsComponents().sniff(fname)
    True
    """
    expected_header = 'species\tn\tnumber_comp\tlnL\tAIC\tBIC\tmean\tvariance\tporportion'
    try:
        first_field = get_headers(filename, '\\t', 1)[0][0]
    except Exception:
        return False
    return first_field == expected_header
def set_meta(self, dataset, **kwd):
    """
    Set metadata for Gal file.

    Reads up to the first five tab-separated lines and stores:
    ``file_format`` and ``version_number`` from line 1,
    ``number_of_optional_header_records`` and ``number_of_data_columns``
    (as ints) from line 2, ``file_type`` from the value after "=" on
    line 3, and, when present, ``block_count`` / ``block_type`` from the
    "BlockCount" / "BlockType" records on lines 4 and 5.
    """
    super().set_meta(dataset, **kwd)
    headers = get_headers(dataset.file_name, sep="\t", count=5)
    dataset.metadata.file_format = headers[0][0]
    dataset.metadata.version_number = headers[0][1]
    dataset.metadata.number_of_optional_header_records = int(headers[1][0])
    dataset.metadata.number_of_data_columns = int(headers[1][1])
    # Records look like "Key=Value", possibly wrapped in double quotes.
    dataset.metadata.file_type = headers[2][0].strip().strip('"').split("=")[1]
    # Lines 4 and 5 are only consulted if the file actually has them;
    # indexing them unconditionally raised IndexError on shorter files.
    if len(headers) > 3 and "BlockCount" in headers[3][0]:
        dataset.metadata.block_count = int(headers[3][0].strip().strip('"').split("=")[1])
    if len(headers) > 4 and "BlockType" in headers[4][0]:
        dataset.metadata.block_type = int(headers[4][0].strip().strip('"').split("=")[1])
def set_meta(self, dataset, overwrite=True, skip=0, **kwd):
    """
    Set metadata for distance matrix files.

    Delegates to the parent ``set_meta`` and then scans the header lines
    (ignoring those starting with '@') for the first one whose fields,
    joined together, parse as an integer; that value is stored as
    ``dataset.metadata.sequence_count``. Lines that fail to parse are
    skipped, with a warning logged unless this instance is a
    PairwiseDistanceMatrix.
    """
    super(DistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)

    for fields in get_headers(dataset.file_name, sep='\t'):
        if fields[0].startswith('@'):
            continue
        try:
            # seq count sometimes preceded by tab, hence the join
            dataset.metadata.sequence_count = int(''.join(fields))
        except Exception as err:
            if not isinstance(self, PairwiseDistanceMatrix):
                log.warning("DistanceMatrix set_meta %s" % err)
        else:
            break
def sniff_prefix(self, file_prefix):
    """
    Try to guess if the file is a FPS file.

    True when the first tab-separated field of the first line, stripped
    of surrounding whitespace, equals '#FPS1'. An input with no lines is
    rejected instead of raising IndexError.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('q.fps')
    >>> FPS().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> FPS().sniff(fname)
    False
    """
    header = get_headers(file_prefix, sep='\t', count=1)
    if not header or not header[0]:
        return False
    return header[0][0].strip() == '#FPS1'
def sniff(self, filename):
    """
    Try to guess if the file is a InChI file.

    Every header line returned by ``get_headers`` (requested with
    ``count=10``) must begin with 'InChI=', and at least one line must be
    present.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('drugbank_drugs.inchi')
    >>> InChI().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> InChI().sniff(fname)
    False
    """
    inchi_lines = get_headers(filename, sep=' ', count=10)
    if not inchi_lines:
        # Without this guard the loop is skipped and an empty file would
        # be reported as InChI.
        return False
    for inchi in inchi_lines:
        if not inchi[0].startswith('InChI='):
            return False
    return True
def sniff(self, filename):
    """
    Determines whether the file is in axt format.

    axt alignment files are produced from Blastz, an alignment tool
    available from Webb Miller's lab at Penn State University.

    Each alignment block in an axt file contains three lines: a summary
    line and 2 sequence lines. Blocks are separated from one another by
    blank lines.

    The summary line contains chromosomal position and size information
    about the alignment. It consists of 9 required fields.

    The sequence lines contain the sequence of the primary assembly
    (line 2) and aligning assembly (line 3) with inserts. Repeats are
    indicated by lower-case letters.

    For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'alignment.axt' )
    >>> Axt().sniff( fname )
    True
    >>> fname = get_test_fname( 'alignment.lav' )
    >>> Axt().sniff( fname )
    False
    """
    headers = get_headers(filename, None)
    if len(headers) < 4:
        return False
    for hdr in headers:
        if not hdr:
            continue
        if hdr[0].startswith("##matrix=axt"):
            return True
        if hdr[0].startswith("#"):
            continue
        # First non-comment line: it alone decides the outcome.
        if len(hdr) != 9:
            return False
        try:
            # Convert eagerly: ``map(int, ...)`` is lazy on Python 3 and
            # would never actually validate these fields.
            for index in (0, 2, 3, 5, 6, 8):
                int(hdr[index])
        except ValueError:
            return False
        return hdr[7] in data.valid_strand
    # Only blank or comment lines were found.
    return False
def sniff(self, filename):
    """
    Determines whether the file is in axt format.

    axt alignment files are produced from Blastz, an alignment tool
    available from Webb Miller's lab at Penn State University.

    Each alignment block in an axt file contains three lines: a summary
    line and 2 sequence lines. Blocks are separated from one another by
    blank lines.

    The summary line contains chromosomal position and size information
    about the alignment. It consists of 9 required fields.

    The sequence lines contain the sequence of the primary assembly
    (line 2) and aligning assembly (line 3) with inserts. Repeats are
    indicated by lower-case letters.

    For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'alignment.axt' )
    >>> Axt().sniff( fname )
    True
    >>> fname = get_test_fname( 'alignment.lav' )
    >>> Axt().sniff( fname )
    False
    """
    headers = get_headers(filename, None)
    if len(headers) < 4:
        return False
    for hdr in headers:
        if not hdr:
            continue
        if hdr[0].startswith("##matrix=axt"):
            return True
        if hdr[0].startswith("#"):
            continue
        # First non-comment line: it alone decides the outcome.
        if len(hdr) != 9:
            return False
        try:
            # Convert eagerly: ``map(int, ...)`` is lazy on Python 3 and
            # would never actually validate these fields.
            for index in (0, 2, 3, 5, 6, 8):
                int(hdr[index])
        except ValueError:
            return False
        return hdr[7] in data.valid_strand
    # Only blank or comment lines were found.
    return False
def sniff(self, filename):
    """
    Determines whether the file is in html format.

    True when the first whitespace-delimited token of any header line
    contains '<html>' (case-insensitive).

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'complete.bed' )
    >>> Html().sniff( fname )
    False
    >>> fname = get_test_fname( 'file.html' )
    >>> Html().sniff( fname )
    True
    """
    headers = get_headers(filename, None)
    # The former bare ``except: return True`` reported any failure as a
    # positive match; empty lines are handled explicitly instead.
    return any(bool(hdr) and '<html>' in hdr[0].lower() for hdr in headers)
def sniff(self, filename):
    """
    Determines whether the file is in html format.

    True when the first whitespace-delimited token of any header line
    contains '<html>' (case-insensitive).

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'complete.bed' )
    >>> Html().sniff( fname )
    False
    >>> fname = get_test_fname( 'file.html' )
    >>> Html().sniff( fname )
    True
    """
    headers = get_headers(filename, None)
    # The former bare ``except: return True`` reported any failure as a
    # positive match; empty lines are handled explicitly instead.
    return any(bool(hdr) and '<html>' in hdr[0].lower() for hdr in headers)
def sniff(self, filename):
    """
    Determines whether the file is a lane mask filter: 1 line consisting
    of zeros and ones.

    Exactly one header line with exactly one tab-separated field is
    required, and that field must be made up solely of '0' and '1'.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.filter' )
    >>> LaneMask().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.filter' )
    >>> LaneMask().sniff( fname )
    False
    """
    rows = get_headers(filename, sep='\t')
    single_field = len(rows) == 1 and len(rows[0]) == 1
    if not single_field:
        return False
    return re.match('^[01]+$', rows[0][0]) is not None
def sniff(self, filename):
    """
    Determines whether the file is a lane mask filter: 1 line consisting
    of zeros and ones.

    Exactly one header line with exactly one tab-separated field is
    required, and that field must be made up solely of '0' and '1'.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.filter' )
    >>> LaneMask().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.filter' )
    >>> LaneMask().sniff( fname )
    False
    """
    rows = get_headers(filename, sep='\t', count=2)
    single_field = len(rows) == 1 and len(rows[0]) == 1
    if not single_field:
        return False
    return re.match('^[01]+$', rows[0][0]) is not None
def sniff(self, filename):
    """
    Report whether the file carries all six expected PDB record names.

    The first space-delimited token of each header line (requested from
    ``get_headers`` with ``count=300``) is collected; True is returned
    only when HEADER, TITLE, COMPND, SOURCE, KEYWDS and EXPDTA all occur.
    """
    required_sections = {'HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA'}
    seen_sections = set()
    for fields in get_headers(filename, sep=' ', count=300):
        seen_sections.add(fields[0].strip())
    return required_sections <= seen_sections
def sniff(self, filename):
    """
    Determines whether the file is a frequency tabular format for chimera
    analysis.

    #1.14.0
    0       0.000
    1       0.000
    ...
    155     0.975

    Lines starting with '@' are ignored. The first remaining line must be
    a single field starting with '#'; every later line must be exactly
    <int><TAB><float>. More than one such line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
    >>> Frequency().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
    >>> Frequency().sniff( fname )
    False
    """
    headers = get_headers(filename, sep='\t')
    count = 0
    for line in headers:
        if not line[0].startswith('@'):
            if count == 0:
                # first line should be #<version string>: reject unless it
                # both starts with '#' and is a single field. The previous
                # test (``not startswith('#') and len == 1``) let a
                # multi-column first line through.
                if not line[0].startswith('#') or len(line) != 1:
                    return False
            else:
                # all other lines should be <int> <float>
                if len(line) != 2:
                    return False
                try:
                    int(line[0])
                    float(line[1])
                except Exception:
                    return False
            count += 1
    if count > 1:
        return True
    return False
def sniff(self, filename):
    """
    Determines whether a file is in gdm format.

    GDM files have at least 6 required fields. (Actually in the format
    definition only the first 5 are mandatory, but the ones returned by
    the system have always at least 6.)

    Required fields must be tab separated.

    Columns 0, 3, 4 must be strings. Columns 1, 2, 5 numbers.

    Column 5 (Score) can be not provided, in which case it is '.'.

    NOTE(review): the check below demands exactly 6 fields, not
    "at least 6" as stated above; confirm which is intended.
    """
    for hdr in get_headers(filename, '\t', count=10):
        # Skip blank lines and comment lines.
        if not (hdr and hdr[0]) or hdr[0].startswith('#'):
            continue
        if len(hdr) != 6:
            return False
        try:
            int(hdr[1])
            int(hdr[2])
            if hdr[5] != '.':
                float(hdr[5])
        except ValueError:
            # Narrowed from bare ``except:`` so interrupts still propagate.
            return False
    return True
def sniff(self, filename, vals_are_int=False):
    """
    Determines whether the file is a otu (operational taxonomic unit)
    Shared format.

    label<TAB>group<TAB>count[<TAB>value(1..n)]

    The first line is column headings as of Mothur v 1.2

    Lines starting with '@' are ignored. Other lines need at least three
    columns. Except for a first line whose label column is 'label',
    column 2 must be an integer equal to the number of columns after it,
    and each of those columns must parse as int (``vals_are_int=True``)
    or float. More than one line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
    >>> GroupAbund().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
    >>> GroupAbund().sniff( fname )
    False
    """
    parse_value = int if vals_are_int else float
    rows_seen = 0
    for fields in get_headers(filename, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if len(fields) < 3:
            return False
        is_heading = rows_seen == 0 and fields[0] == 'label'
        if not is_heading:
            try:
                if int(fields[2]) + 3 != len(fields):
                    return False
                for value in fields[3:]:
                    parse_value(value)
            except ValueError:
                return False
        rows_seen += 1
    return rows_seen > 1
def sniff(self, filename):
    """
    Determines whether the file is a quantiles tabular format for chimera
    analysis.

    1       0       0       0       0       0       0
    2       0.309198        0.309198        0.37161 0.37161 0.37161 0.37161
    3       0.510982        0.563213        0.693529        0.858939        1.07442 1.20608
    ...

    Lines starting with '@' or '#' are ignored. Every other line must
    have exactly seven columns: an integer followed by six floats. At
    least one such line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
    >>> Quantile().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
    >>> Quantile().sniff( fname )
    False
    """
    rows_seen = 0
    for fields in get_headers(filename, sep='\t'):
        if fields[0].startswith('@') or fields[0].startswith('#'):
            continue
        if len(fields) != 7:
            return False
        try:
            int(fields[0])
            for value in fields[1:7]:
                float(value)
        except Exception:
            return False
        rows_seen += 1
    return rows_seen > 0
def sniff(self, filename):
    """
    Determines whether the file is in lav format.

    LAV is an alignment format developed by Webb Miller's group. It is the
    primary output format for BLASTZ. The first line of a .lav file begins
    with #:lav.

    For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

    >>> fname = get_test_fname( 'alignment.lav' )
    >>> Lav().sniff( fname )
    True
    >>> fname = get_test_fname( 'alignment.axt' )
    >>> Lav().sniff( fname )
    False
    """
    headers = get_headers(filename, None)
    # More than one line is required and the first token of the first line
    # must start with "#:lav". The explicit emptiness check on headers[0]
    # replaces a bare ``except:`` that also swallowed KeyboardInterrupt.
    return len(headers) > 1 and bool(headers[0]) and headers[0][0].startswith('#:lav')
def sniff(self, filename):
    """
    Checks for 'pileup-ness'.

    There are two main types of pileup: 6-column and 10-column. For both,
    the first three and last two columns are the same. We only check the
    first three to allow for some personalization of the format.

    Every non-empty header line not starting with '#' must have at least
    three tab-separated columns, an integer in column 1 (0-based) and a
    single reference base (A/C/G/T/N, either case) in column 2.

    >>> fname = get_test_fname( 'interval.interval' )
    >>> Pileup().sniff( fname )
    False
    >>> fname = get_test_fname( '6col.pileup' )
    >>> Pileup().sniff( fname )
    True
    >>> fname = get_test_fname( '10col.pileup' )
    >>> Pileup().sniff( fname )
    True
    """
    valid_bases = ('A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n')
    for hdr in get_headers(filename, '\t'):
        if not hdr or hdr[0].startswith('#'):
            continue
        if len(hdr) < 3:
            return False
        try:
            # chrom start in column 1 (with 0-based columns)
            int(hdr[1])
        except ValueError:
            return False
        # Reference base is in column 2. An explicit test is used rather
        # than ``assert``, which is removed when Python runs with -O.
        if hdr[2] not in valid_bases:
            return False
    return True
def sniff(self, filename):
    """
    Report whether the file looks like an FPS file.

    True when the first tab-separated field of the first line, stripped
    of surrounding whitespace, equals '#FPS1'. An input with no lines is
    rejected instead of raising IndexError.
    """
    header = get_headers(filename, sep='\t', count=1)
    if not header or not header[0]:
        return False
    return header[0][0].strip() == '#FPS1'
def sniff(self, filename):
    """
    Report whether the first line is the expected column-heading row.

    The first line must have at least seven tab-separated columns, with
    column 0 equal to "gene_id", column 1 starting with "transcript_id"
    and column 6 equal to "FPKM".
    """
    # The line is split on tabs so that the individual columns can be
    # inspected. The separator used to be '\n', which presumably left the
    # whole line as a single field so the seven-column test could never
    # pass -- TODO confirm against get_headers.
    headers = get_headers(filename, '\t', count=1)
    return (
        len(headers) > 0
        and len(headers[0]) >= 7
        and headers[0][0] == "gene_id"
        and headers[0][1].startswith("transcript_id")
        and headers[0][6] == "FPKM"
    )
def sniff(self, filename):
    """
    Report whether the file looks like a VCF file.

    True when the first field of the first line starts with
    "##fileformat=VCF". An input with no lines is rejected instead of
    raising IndexError.
    """
    headers = get_headers(filename, '\n', count=1)
    if not headers or not headers[0]:
        return False
    return headers[0][0].startswith("##fileformat=VCF")