def init_meta(self, dataset, copy_from=None):
    """Run the Tabular metadata initialisation, then install the column headers."""
    Tabular.init_meta(self, dataset, copy_from=copy_from)
    headers = (
        'DB',
        'DB Object ID',
        'DB Object Symbol',
        'Qualifier',
        'GO ID',
        'DB:Reference (|DB:Reference)',
        'Evidence Code',
        'With (or) From',
        'Aspect',
        'DB Object',
        'DB Object Synonym (|Synonym)',
        'DB Object Type',
        'Taxon(|taxon)',
        'Date',
        'Assigned By',
        'Annotation Extension',
        'Gene Product Form ID',
    )
    self.column_names = list(headers)
def set_meta(self, dataset, overwrite=True, **kwd):
    """Sets the metadata information for datasets previously determined to be in bed format.

    Scans the file for the first non-comment line with more than two
    tab-separated fields that starts (case-insensitively) with one of
    ``data.col1_startswith``. From that line it sets ``nameCol`` (when there
    are more than 3 fields) and ``strandCol`` (6 when there are at least 6
    fields, otherwise 0). The 0-based index of the line where scanning
    stopped is passed to ``Tabular.set_meta`` as ``skip``.

    Existing metadata values are only replaced when ``overwrite`` is true or
    the element is not yet set. Nothing happens when the dataset has no data.
    """
    i = 0
    if dataset.has_data():
        # open() replaces the Python-2-only file() builtin; the context
        # manager guarantees the handle is closed.
        with open(dataset.file_name) as handle:
            for i, line in enumerate(handle):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                elems = line.split('\t')
                if len(elems) <= 2:
                    continue
                lowered = line.lower()
                if not any(lowered.startswith(prefix) for prefix in data.col1_startswith):
                    continue
                if len(elems) > 3:
                    if overwrite or not dataset.metadata.element_is_set('nameCol'):
                        dataset.metadata.nameCol = 4
                if overwrite or not dataset.metadata.element_is_set('strandCol'):
                    dataset.metadata.strandCol = 0 if len(elems) < 6 else 6
                # Metadata has been derived from this line; stop scanning.
                break
        Tabular.set_meta(self, dataset, overwrite=overwrite, skip=i)
def set_meta(self, dataset, overwrite=True, **kwd):
    """Locate the first valid 9-column data line and delegate to Tabular.set_meta.

    A line qualifies when its start (field 4) and end (field 5) are each an
    integer or ``.``, start < end when both are integers, the strand
    (field 7) is in ``self.valid_gff3_strand`` and the phase (field 8) is in
    ``self.valid_gff3_phase``. The 0-based index of the line where scanning
    stopped is passed on as ``skip``.
    """

    def parse_coordinate(field):
        # Returns (is_valid, value); value is None for the '.' placeholder.
        if field == '.':
            return True, None
        try:
            return True, int(field)
        except ValueError:
            return False, None

    i = 0
    # open() replaces the Python-2-only file() builtin and the handle is
    # closed deterministically.
    with open(dataset.file_name) as handle:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            elems = line.split('\t')
            if len(elems) != 9:
                continue
            valid_start, start = parse_coordinate(elems[3])
            valid_end, end = parse_coordinate(elems[4])
            if not (valid_start and valid_end):
                continue
            # The ordering check only makes sense for two real numbers; the
            # original compared unbound/stale values when a field was '.'.
            if start is not None and end is not None and not start < end:
                continue
            strand = elems[6]
            phase = elems[7]
            if strand in self.valid_gff3_strand and phase in self.valid_gff3_phase:
                break
    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=i)
def __init__(self, **kwd):
    """Set up the CG_Var datatype: Tabular initialisation plus fixed column headers."""
    Tabular.__init__(self, **kwd)
    headers = (
        'locus',
        'ploidy',
        'allele',
        'chromosome',
        'begin',
        'end',
        'varType',
        'reference',
        'alleleSeq',
        'varScoreVAF',
        'varScoreEAF',
        'varQuality',
        'hapLink',
        'xRef',
    )
    self.column_names = list(headers)
def __init__(self, **kwd):
    """Set up the gg datatype and register its UCSC 'Genome Graph' display app."""
    Tabular.__init__(self, **kwd)
    self.add_display_app(
        'ucsc',
        'Genome Graph',
        'as_ucsc_display_file',
        'ucsc_links',
    )
def __init__(self, **kwd):
    """Initialise via Tabular, then install the fixed report column headers."""
    Tabular.__init__(self, **kwd)
    headers = (
        'Protein',
        'Peptide',
        'Assumed Charge',
        'Neutral Pep Mass (calculated)',
        'Neutral Mass',
        'Retention Time',
        'Start Scan',
        'End Scan',
        'Search Engine',
        'PeptideProphet Probability',
        'Interprophet Probabaility',
    )
    self.column_names = list(headers)
def __init__(self, **kwd):
    """Initialise via Tabular, then record the 13 fixed column headers and the column count."""
    Tabular.__init__(self, **kwd)
    headers = (
        'ID',
        'local_max',
        'min_count_aTIS',
        'R_aTis',
        'min_count_5UTR',
        'R_5UTR',
        'min_count_CDS',
        'R_CDS',
        'min_count_3UTR',
        'R_3UTR',
        'min_count_no_trans',
        'R_no_trans',
        'SNP',
    )
    self.column_names = list(headers)
    self.columns = 13
def set_meta(self, dataset, **kwd):
    """Set tabular metadata, then derive column count and types from the first line.

    After delegating to ``Tabular.set_meta``, ``markerCol`` is set to 1, the
    first line of the file is split on tabs to obtain the column count, and
    the column types are recorded as ``'string'`` for the first column and
    ``'numeric'`` for every other column.

    Returns:
        True, always.
    """
    Tabular.set_meta(self, dataset, **kwd)
    dataset.metadata.markerCol = 1
    # Read only the first line (the original loaded the whole file and never
    # closed it). readline() yields '' for an empty file, which produces a
    # single empty header field instead of an IndexError.
    with open(dataset.file_name, 'r') as handle:
        header = handle.readline().strip().split('\t')
    dataset.metadata.columns = len(header)
    # str.split always returns at least one field, so column 0 exists.
    dataset.metadata.column_types = ['string'] + ['numeric'] * (len(header) - 1)
    return True
def set_meta(self, dataset, **kwd):
    """Set tabular metadata, then derive column count and types from the first line.

    After delegating to ``Tabular.set_meta``, ``markerCol`` is set to 1, the
    first line of the file is split on tabs to obtain the column count, and
    the column types are recorded as ``'string'`` for the first column and
    ``'numeric'`` for every other column.

    Returns:
        True, always.
    """
    Tabular.set_meta(self, dataset, **kwd)
    dataset.metadata.markerCol = 1
    # open() replaces the Python-2-only file() builtin. Only the first line
    # is read, and the handle is closed. readline() yields '' for an empty
    # file, which produces a single empty header field instead of an
    # IndexError.
    with open(dataset.file_name, 'r') as handle:
        header = handle.readline().strip().split('\t')
    dataset.metadata.columns = len(header)
    # str.split always returns at least one field, so column 0 exists.
    dataset.metadata.column_types = ['string'] + ['numeric'] * (len(header) - 1)
    return True
def __init__(self, **kwd):
    """Set up the CG_Gene datatype: Tabular initialisation plus fixed column headers."""
    Tabular.__init__(self, **kwd)
    headers = (
        'index',
        'locus',
        'allele',
        'chromosome',
        'begin',
        'end',
        'varType',
        'reference',
        'call',
        'xRef',
        'geneId',
        'mrnaAcc',
        'proteinAcc',
        'symbol',
        'orientation',
        'component',
        'componentIndex',
        'hasCodingRegion',
        'impact',
        'nucleotidePos',
        'proteinPos',
        'annotationRefSequence',
        'sampleSequence',
        'genomeRefSequence',
        'pfam',
    )
    self.column_names = list(headers)
# set_meta: delegates to Tabular.set_meta with max_data_lines=None and
# max_guess_type_data_lines=1000, forwarding `overwrite` and any extra keywords.
# When dataset.metadata.comment_metadata is None it is filled with a copy of
# DatasetCommentMetadata(dataset).comment_metadata, and
# self.set_dataset_metadata_from_comments(dataset) is called.
# NOTE(review): this definition is collapsed onto one line, so it cannot be
# told from here whether set_dataset_metadata_from_comments runs only inside
# the `is None` branch or unconditionally - confirm against the original layout.
def set_meta(self, dataset, overwrite=True, **kwd): Tabular.set_meta(self, dataset, overwrite=overwrite, max_data_lines=None, max_guess_type_data_lines=1000, **kwd) if dataset.metadata.comment_metadata is None: dataset_comment_metadata = DatasetCommentMetadata(dataset) dataset.metadata.comment_metadata = dataset_comment_metadata.comment_metadata.copy( ) self.set_dataset_metadata_from_comments(dataset)
def __init__(self, **kwd):
    """Initialise via Tabular, then install the fixed report column headers."""
    Tabular.__init__(self, **kwd)
    headers = (
        "Entry Number",
        "Group Probability",
        "Protein",
        "Protein Link",
        "Protein Probability",
        "Percent Coverage",
        "Number of Unique Peptides",
        "Total Independent Spectra",
        "Percent Share of Spectrum ID's",
        "Description",
        "Protein Molecular Weight",
        "Protein Length",
        "Is Nondegenerate Evidence",
        "Weight",
        "Precursor Ion Charge",
        "Peptide sequence",
        "Peptide Link",
        "NSP Adjusted Probability",
        "Initial Probability",
        "Number of Total Termini",
        "Number of Sibling Peptides Bin",
        "Number of Instances",
        "Peptide Group Designator",
        "Is Evidence?",
    )
    self.column_names = list(headers)
def set_meta(self, dataset, overwrite=True, **kwd):
    """Locate the first 9-column data line with integer start/end and delegate to Tabular.set_meta.

    Lines that are empty or start with ``#`` are ignored. Scanning stops at
    the first line that has exactly 9 tab-separated fields whose 4th and 5th
    fields both parse as integers. The 0-based index of the line where
    scanning stopped is passed on as ``skip``.
    """
    i = 0
    # open() replaces the Python-2-only file() builtin and the handle is
    # closed deterministically.
    with open(dataset.file_name) as handle:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            elems = line.split('\t')
            if len(elems) != 9:
                continue
            try:
                int(elems[3])
                int(elems[4])
            except ValueError:
                # Not a data line; keep looking.
                continue
            break
    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=i)
def __init__(self, **kwd):
    """Initialize CG_MasterVar datatype.

    Delegates to ``Tabular.__init__`` and installs the fixed column headers.
    """
    Tabular.__init__(self, **kwd)
    self.column_names = [
        'locus',
        'ploidy',
        'chromosome',
        'begin',
        'end',
        'zygosity',
        'varType',
        'reference',
        'allele1Seq',
        'allele2Seq',
        'allele1VarScoreVAF',
        'allele2VarScoreVAF',
        'allele1VarScoreEAF',
        'allele2VarScoreEAF',
        'allele1VarQuality',
        'allele2VarQuality',
        'allele1HapLink',
        'allele2HapLink',
        'allele1XRef',
        'allele2XRef',
        'evidenceIntervalId',
        'allele1ReadCount',
        'allele2ReadCount',
        # NOTE(review): the vendor format appears to name this column
        # 'referenceAlleleReadCount' - confirm before renaming.
        'referenceAlleleRead',
        'totalReadCount',
        'allele1Gene',
        # 'allele2Gene' and 'pfam' were fused into the single entry
        # 'allele2Gene pfam' (missing separator), which shifted every
        # following header by one column.
        'allele2Gene',
        'pfam',
        'miRBaseId',
        'repeatMasker',
        'segDupOverlap',
        'relativeCoverageDiploid',
        'calledPloidy',
        'relativeCoverageNondiploid',
        'calledLevel',
    ]
def test_tabular_set_meta_large_file():
    """Run Tabular.set_meta on a two-column file with MAX_DATA_LINES + 1 rows."""
    row_count = MAX_DATA_LINES + 1
    with tempfile.NamedTemporaryFile(mode='w') as test_file:
        test_file.write("A\tB\n" * row_count)
        test_file.flush()
        # The dataset must be processed while the temporary file still exists.
        dataset = MockDataset(id=1)
        dataset.file_name = test_file.name
        Tabular().set_meta(dataset)
def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
    """Set tabular metadata and record the distinct first-column values in ``metadata.args``.

    ``overwrite``, ``skip`` and ``max_data_lines`` are forwarded positionally
    to ``Tabular.set_meta``; extra keywords are accepted but not forwarded.
    Every line of the file is then stripped and split on tabs, and the set of
    first fields is stored as a list in ``dataset.metadata.args`` (in
    arbitrary order, as it comes from a set).
    """
    Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
    # A context manager replaces try/finally: previously a failing open()
    # left `fh` unbound and the finally clause raised NameError instead of
    # the real error.
    with open(dataset.file_name) as fh:
        # str.split always yields at least one element, so index 0 is safe.
        tis_args = {line.strip().split("\t")[0] for line in fh}
    dataset.metadata.args = list(tis_args)
def __init__(self, **kwd):
    """Initialise via Tabular, then record the 13 fixed column headers and the column count."""
    Tabular.__init__(self, **kwd)
    headers = (
        "ID",
        "local_max",
        "min_count_aTIS",
        "R_aTis",
        "min_count_5UTR",
        "R_5UTR",
        "min_count_CDS",
        "R_CDS",
        "min_count_3UTR",
        "R_3UTR",
        "min_count_no_trans",
        "R_no_trans",
        "SNP",
    )
    self.column_names = list(headers)
    self.columns = 13
def set_meta(self, dataset, overwrite=True, **kwd):
    """Locate the first data line and delegate to Tabular.set_meta.

    Lines that are empty or start with ``#`` are ignored. Scanning stops at
    the first line whose first tab-separated field either parses as a float
    or starts (case-insensitively) with one of ``data.col1_startswith``. The
    0-based index of the line where scanning stopped is passed on as ``skip``.
    """
    i = 0
    # open() replaces the Python-2-only file() builtin and the handle is
    # closed deterministically.
    with open(dataset.file_name) as handle:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            first_field = line.split('\t')[0]
            try:
                # "Wiggle track data values can be integer or real, positive or negative values"
                float(first_field)
            except ValueError:
                is_number = False
            else:
                is_number = True
            if is_number:
                break
            lowered = first_field.lower()
            if any(lowered.startswith(prefix) for prefix in data.col1_startswith):
                break
    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=i)
def init_meta(self, dataset, copy_from=None):
    """Run the Tabular metadata initialisation, then install the column headers."""
    Tabular.init_meta(self, dataset, copy_from=copy_from)
    headers = (
        'DB',
        'DB Object ID',
        'DB Object Symbol',
        'Qualifier',
        'GO ID',
        'DB:Reference (|DB:Reference)',
        'Evidence Code',
        'With (or) From',
        'Aspect',
        'DB Object',
        'DB Object Synonym (|Synonym)',
        'DB Object Type',
        'Taxon(|taxon)',
        'Date',
        'Assigned By',
        'Annotation Extension',
        'Gene Product Form ID',
    )
    self.column_names = list(headers)
def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
    """Set tabular metadata and record the distinct first-column values in ``metadata.args``.

    ``overwrite``, ``skip`` and ``max_data_lines`` are forwarded positionally
    to ``Tabular.set_meta``; extra keywords are accepted but not forwarded.
    Every line of the file is then stripped and split on tabs, and the set of
    first fields is stored as a list in ``dataset.metadata.args`` (in
    arbitrary order, as it comes from a set).
    """
    Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
    # A context manager replaces try/finally: previously a failing open()
    # left `fh` unbound and the finally clause raised NameError instead of
    # the real error.
    with open(dataset.file_name) as fh:
        # str.split always yields at least one element, so index 0 is safe.
        tis_args = {line.strip().split('\t')[0] for line in fh}
    dataset.metadata.args = list(tis_args)
def __init__(self, **kwd):
    """Initialize RsemResults datatype.

    Delegates to ``Tabular.__init__`` and sets ``comment_lines`` to 1.
    """
    # The description string used to follow this call, which made it a
    # discarded expression rather than the function's docstring.
    Tabular.__init__(self, **kwd)
    self.comment_lines = 1
def set_peek(self, dataset):
    """Build the Tabular peek, then append the dimension metadata to the blurb.

    Nothing is appended when the underlying dataset has been purged.
    """
    Tabular.set_peek(self, dataset)
    if dataset.dataset.purged:
        return
    dimension_text = str(dataset.metadata.dimension)
    dataset.blurb += ' dim: ' + dimension_text
def set_meta(self, dataset, overwrite=True, **kwd):
    """Delegate to Tabular.set_meta with ``skip`` fixed at 1.

    Extra keyword arguments are accepted but not forwarded.
    """
    Tabular.set_meta(
        self,
        dataset,
        overwrite=overwrite,
        skip=1,
    )
def set_meta(self, dataset, **kwd):
    """Forward the call unchanged to Tabular.set_meta."""
    Tabular.set_meta(self, dataset, **kwd)
def __init__(self, **kwd):
    """Set up the interval datatype and register its UCSC display app."""
    Tabular.__init__(self, **kwd)
    self.add_display_app(
        'ucsc',
        'display at UCSC',
        'as_ucsc_display_file',
        'ucsc_links',
    )
def __init__(self, **kwd):
    """Set up the datatype and register the 'display in Wormbase' GBrowse display app."""
    Tabular.__init__(self, **kwd)
    self.add_display_app(
        'c_elegans',
        'display in Wormbase',
        'as_gbrowse_display_file',
        'gbrowse_links',
    )
def __init__(self, **kwd):
    """Set up the feature list datatype with an empty list of column names."""
    Tabular.__init__(self, **kwd)
    self.column_names = list()
def set_meta(self, dataset, overwrite=True, first_line_is_header=False, **kwd):
    """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand.

    ``Tabular.set_meta`` is called first with ``skip=0``. If the dataset has
    data, the file is then scanned line by line, ignoring empty lines:

    * A header line (``first_line_is_header`` is true, or the line starts
      with ``#``) re-initialises the metadata via ``self.init_meta`` and maps
      each recognised column title (keys of ``alias_helper``) to its
      1-based column number. Scanning stops after that line.
    * Otherwise a line with more than two fields that starts
      (case-insensitively) with one of ``data.col1_startswith`` is used for a
      best guess: chromCol=1, startCol=2 / endCol=3 when those fields are
      integers, nameCol=4 when a fourth field exists and is not an integer,
      strandCol=6 when the sixth field is in ``data.valid_strand`` and 0
      otherwise. Scanning stops after the first such line.

    Metadata elements are only written when ``overwrite`` is true or the
    element is not already set.
    """
    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=0)
    if not dataset.has_data():
        return

    def assign(meta_name, value):
        # Respect values that are already set unless overwriting is requested.
        if overwrite or not dataset.metadata.element_is_set(meta_name):
            setattr(dataset.metadata, meta_name, value)

    def is_int(text):
        try:
            int(text)
        except ValueError:
            return False
        return True

    # open() replaces the Python-2-only file() builtin and the handle is
    # closed deterministically.
    with open(dataset.file_name) as handle:
        for line in handle:
            line = line.rstrip('\r\n')
            if not line:
                continue
            if first_line_is_header or line[0] == '#':
                self.init_meta(dataset)
                elems = line.strip('#').split('\t')
                valid = dict(alias_helper)  # shrinks as aliases are consumed
                for index, col_name in enumerate(elems):
                    if col_name in valid:
                        meta_name = valid[col_name]
                        assign(meta_name, index + 1)
                        values = alias_spec[meta_name]
                        start = values.index(col_name)
                        for lower in values[start:]:
                            # Removes lower priority keys. pop() tolerates an
                            # alias that an earlier column already removed,
                            # where `del` raised KeyError.
                            valid.pop(lower, None)
                break  # Our metadata is set, so stop scanning

            # Header lines in Interval files are optional. For example, BED is
            # Interval but has no header. We'll make a best guess at the
            # location of the metadata columns.
            elems = line.split('\t')
            if len(elems) <= 2:
                continue
            lowered = line.lower()
            if not any(lowered.startswith(prefix) for prefix in data.col1_startswith):
                continue
            assign('chromCol', 1)
            # When a field is not an integer the metadata default is used.
            if is_int(elems[1]):
                assign('startCol', 2)
            if is_int(elems[2]):
                assign('endCol', 3)
            if len(elems) > 3 and not is_int(elems[3]):
                assign('nameCol', 4)
            if len(elems) < 6 or elems[5] not in data.valid_strand:
                assign('strandCol', 0)
            else:
                assign('strandCol', 6)
            break  # Our metadata is set, so stop scanning
def init_meta(self, dataset, copy_from=None):
    """Forward metadata initialisation to Tabular.init_meta."""
    Tabular.init_meta(
        self,
        dataset,
        copy_from=copy_from,
    )
def display_peek(self, dataset):
    """Return the result of Tabular.make_html_table, headed by this datatype's column names."""
    peek_table = Tabular.make_html_table(self, dataset, column_names=self.column_names)
    return peek_table
def display_peek(self, dataset):
    """Return the peek as formatted HTML, headed by this datatype's column names."""
    peek_table = Tabular.make_html_table(self, dataset, column_names=self.column_names)
    return peek_table
def __init__(self, **kwd):
    """Initialise via Tabular and register the UCSC and GBrowse display apps."""
    Tabular.__init__(self, **kwd)
    display_apps = (
        ('ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links'),
        ('gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links'),
    )
    # Registration order matches the order of the tuples above.
    for app_arguments in display_apps:
        self.add_display_app(*app_arguments)
def __init__(self, **kwd):
    """Initialise via Tabular, then install the fixed report column headers."""
    Tabular.__init__(self, **kwd)
    headers = (
        'Protein',
        'Peptide',
        'Assumed Charge',
        'Neutral Pep Mass (calculated)',
        'Neutral Mass',
        'Retention Time',
        'Start Scan',
        'End Scan',
        'Search Engine',
        'PeptideProphet Probability',
        'Interprophet Probabaility',
    )
    self.column_names = list(headers)
def init_meta(self, dataset, copy_from=None):
    """Forward metadata initialisation to Tabular.init_meta."""
    Tabular.init_meta(
        self,
        dataset,
        copy_from=copy_from,
    )
def display_peek(self, dataset):
    """Return the result of Tabular.make_html_table, headed by this datatype's column names."""
    peek_table = Tabular.make_html_table(self, dataset, column_names=self.column_names)
    return peek_table
def set_meta(self, dataset, **kwd):
    """Set tabular metadata, then store the file's dimension in ``metadata.dimension``.

    The dimension is whatever ``self._get_dimension`` returns for the open
    file handle; it is only computed when the dataset has data.
    """
    Tabular.set_meta(self, dataset, **kwd)
    if not dataset.has_data():
        return
    # Undecodable bytes are dropped rather than raising while reading.
    with open(dataset.file_name, errors='ignore') as handle:
        dataset.metadata.dimension = self._get_dimension(handle)
def set_meta(self, dataset, **kwd):
    """Delegate to Tabular.set_meta with ``skip`` forced to None.

    Any other keyword arguments are forwarded unchanged.
    """
    # Override instead of passing skip=None next to **kwd: a caller-supplied
    # `skip` would otherwise raise "got multiple values for keyword argument".
    kwd['skip'] = None
    Tabular.set_meta(self, dataset, **kwd)
# set_meta: delegates to Tabular.set_meta with max_data_lines=None and
# max_guess_type_data_lines=1000, forwarding `overwrite` and any extra keywords.
# When dataset.metadata.comment_metadata is None it is filled with a copy of
# DatasetCommentMetadata(dataset).comment_metadata, and
# self.set_dataset_metadata_from_comments(dataset) is called.
# NOTE(review): this definition is collapsed onto one line, so it cannot be
# told from here whether set_dataset_metadata_from_comments runs only inside
# the `is None` branch or unconditionally - confirm against the original layout.
def set_meta( self, dataset, overwrite = True, **kwd ): Tabular.set_meta( self, dataset, overwrite=overwrite, max_data_lines=None, max_guess_type_data_lines=1000, **kwd ) if dataset.metadata.comment_metadata is None: dataset_comment_metadata = DatasetCommentMetadata( dataset ) dataset.metadata.comment_metadata = dataset_comment_metadata.comment_metadata.copy() self.set_dataset_metadata_from_comments( dataset )
def set_meta(self, dataset, overwrite=True, first_line_is_header=False, **kwd):
    """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand.

    ``Tabular.set_meta`` is called first with ``skip=0``. If the dataset has
    data, the file is then scanned line by line, counting but otherwise
    ignoring empty lines:

    * A header line (``first_line_is_header`` is true, or the line starts
      with ``#``) re-initialises the metadata via ``self.init_meta`` and maps
      each recognised column title (keys of ``alias_helper``) to its
      1-based column number. Scanning stops after that line.
    * Otherwise a line with more than two fields that starts
      (case-insensitively) with one of ``data.col1_startswith`` is used for a
      best guess: chromCol=1, startCol=2 / endCol=3 when those fields are
      integers, strandCol=6 when the sixth field is in ``data.valid_strand``
      and 0 otherwise. Scanning stops after the first such line, or once
      the number of non-empty lines examined exceeds ``num_check_lines``.

    The name column is deliberately not guessed; it must be set manually.
    Metadata elements are only written when ``overwrite`` is true or the
    element is not already set.
    """
    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=0)
    if not dataset.has_data():
        return

    def assign(meta_name, value):
        # Respect values that are already set unless overwriting is requested.
        if overwrite or not dataset.metadata.element_is_set(meta_name):
            setattr(dataset.metadata, meta_name, value)

    def is_int(text):
        try:
            int(text)
        except ValueError:
            return False
        return True

    empty_line_count = 0
    num_check_lines = 100  # only check up to this many non empty lines
    # open() replaces the Python-2-only file() builtin and the handle is
    # closed deterministically.
    with open(dataset.file_name) as handle:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if not line:
                empty_line_count += 1
                continue
            if first_line_is_header or line[0] == '#':
                self.init_meta(dataset)
                elems = line.strip('#').split('\t')
                valid = dict(alias_helper)  # shrinks as aliases are consumed
                for index, col_name in enumerate(elems):
                    if col_name in valid:
                        meta_name = valid[col_name]
                        assign(meta_name, index + 1)
                        values = alias_spec[meta_name]
                        start = values.index(col_name)
                        for lower in values[start:]:
                            # Removes lower priority keys. pop() tolerates an
                            # alias that an earlier column already removed,
                            # where `del` raised KeyError.
                            valid.pop(lower, None)
                break  # Our metadata is set, so stop scanning

            # Header lines in Interval files are optional. For example, BED is
            # Interval but has no header. We'll make a best guess at the
            # location of the metadata columns.
            metadata_is_set = False
            elems = line.split('\t')
            lowered = line.lower()
            if len(elems) > 2 and any(lowered.startswith(prefix) for prefix in data.col1_startswith):
                assign('chromCol', 1)
                # When a field is not an integer the metadata default is used.
                if is_int(elems[1]):
                    assign('startCol', 2)
                if is_int(elems[2]):
                    assign('endCol', 3)
                # The strand is still guessed because the check against
                # data.valid_strand makes it an educated guess.
                if len(elems) < 6 or elems[5] not in data.valid_strand:
                    assign('strandCol', 0)
                else:
                    assign('strandCol', 6)
                metadata_is_set = True
            if metadata_is_set or (i - empty_line_count) > num_check_lines:
                # Our metadata is set or the non-empty line budget is used up.
                break
def display_peek(self, dataset):
    """Return the peek as formatted HTML, passing 'track' and '#' as skipchars."""
    skipped_prefixes = ['track', '#']
    return Tabular.make_html_table(self, dataset, skipchars=skipped_prefixes)
def __init__(self, **kwd):
    """Initialise via Tabular, then record the 12 fixed column headers and the column count."""
    Tabular.__init__(self, **kwd)
    headers = (
        'ID',
        'local_max',
        'min_count_aTIS',
        'R_aTis',
        'min_count_5UTR',
        'R_5UTR',
        'min_count_CDS',
        'R_CDS',
        'min_count_3UTR',
        'R_3UTR',
        'min_count_no_trans',
        'R_no_trans',
    )
    self.column_names = list(headers)
    self.columns = 12
def set_readonly_meta(self, dataset, skip=1, **kwd):
    """Reset the read-only metadata elements via Tabular.set_readonly_meta.

    ``skip`` defaults to 1 here and is forwarded together with any extra
    keyword arguments.
    """
    Tabular.set_readonly_meta(
        self,
        dataset,
        skip=skip,
        **kwd
    )
def display_peek(self, dataset):
    """Return the peek as formatted HTML, headed by this datatype's column names."""
    peek_table = Tabular.make_html_table(self, dataset, column_names=self.column_names)
    return peek_table
def make_html_table(self, dataset):
    """Return Tabular.make_html_table's result, passing 'track' and '#' as skipchars."""
    skipped_prefixes = ['track', '#']
    return Tabular.make_html_table(self, dataset, skipchars=skipped_prefixes)
def __init__(self, **kwd):
    """Forward all keyword arguments to Tabular.__init__."""
    Tabular.__init__(self, **kwd)
def set_meta(self, dataset, **kwd):
    """Forward the call unchanged to Tabular.set_meta."""
    Tabular.set_meta(self, dataset, **kwd)
def display_peek(self, dataset):
    """Return the peek as formatted HTML, passing '#' as the only skipchars entry."""
    skipped_prefixes = ['#']
    return Tabular.make_html_table(self, dataset, skipchars=skipped_prefixes)
def __init__(self, **kwd):
    """Initialise via Tabular, then install the three fixed column headers."""
    Tabular.__init__(self, **kwd)
    headers = ('Column', 'Name', 'Alias')
    self.column_names = list(headers)
def __init__(self, **kwd):
    """Set up the QTLMap:Genealogy datatype by delegating to Tabular.__init__."""
    Tabular.__init__(self, **kwd)