def set_meta(self, dataset, **kwd):
    """Populate snpEff database metadata from the dataset's extra files.

    Walks ``dataset.extra_files_path``: a ``snpEffectPredictor*`` file marks
    a downloaded genome database (its directory name is recorded as the
    genome version); ``regulation_*.bin`` files are collected as regulation
    names; known annotation files are mapped to their snpEff flag names.
    """
    Text.set_meta(self, dataset, **kwd)
    data_dir = dataset.extra_files_path
    # search data_dir/genome_version for files
    regulation_pattern = 'regulation_(.+).bin'
    # annotation files that are included in snpEff by a flag
    annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'}
    regulations = []
    annotations = []
    if data_dir and os.path.isdir(data_dir):
        for root, _dirs, files in os.walk(data_dir):
            for fname in files:
                if fname.startswith('snpEffectPredictor'):
                    # if snpEffectPredictor.bin download succeeded
                    dataset.metadata.genome_version = os.path.basename(root)
                else:
                    match = re.match(regulation_pattern, fname)
                    if match:
                        regulations.append(match.group(1))
                    elif fname in annotations_dict:
                        flag = annotations_dict[fname]
                        annotations.append(flag.lstrip('-'))
    dataset.metadata.regulation = regulations
    dataset.metadata.annotation = annotations
def __init__(self, **kwd):
    """Register the composite members of this datatype.

    Both member file names are derived from the ``reference_name``
    metadata element.
    """
    Text.__init__(self, **kwd)
    members = [
        ("%s.grp", "Group File"),
        ("%s.ti", ""),
    ]
    for name_pattern, description in members:
        self.add_composite_file(
            name_pattern,
            description=description,
            substitute_name_with_metadata="reference_name",
            is_binary=False,
        )
def __init__(self, **kwd):
    """Declare the bgzipped dbNSFP data file and its tabix index.

    Both composite file names substitute the ``reference_name`` metadata.
    """
    Text.__init__(self, **kwd)
    for pattern, description in (
        ('%s.gz', 'dbNSFP bgzip'),
        ('%s.gz.tbi', 'Tabix Index File'),
    ):
        self.add_composite_file(
            pattern,
            description=description,
            substitute_name_with_metadata='reference_name',
            is_binary=True,
        )
def __init__(self, **kwd):
    """Set up the two composite members (.grp and .ti) of this datatype."""
    Text.__init__(self, **kwd)
    # Group file; name substituted from the 'reference_name' metadata.
    self.add_composite_file(
        '%s.grp',
        description='Group File',
        substitute_name_with_metadata='reference_name',
        is_binary=False,
    )
    # Accompanying .ti file (no UI description).
    self.add_composite_file(
        '%s.ti',
        description='',
        substitute_name_with_metadata='reference_name',
        is_binary=False,
    )
def set_meta(self, dataset, **kwd):
    """Collect snpEff database metadata and write a short summary.

    Walks ``dataset.extra_files_path`` for snpEff files, recording the
    genome version, the SnpEff version (read via
    ``self.getSnpeffVersionFromFile``), and any regulation/annotation
    extras, then writes a human-readable summary into the dataset file.
    """
    Text.set_meta(self, dataset, **kwd)
    data_dir = dataset.extra_files_path
    # search data_dir/genome_version for files
    regulation_pattern = 'regulation_(.+).bin'
    # annotation files that are included in snpEff by a flag
    annotations_dict = {
        'nextProt.bin': '-nextprot',
        'motif.bin': '-motif',
        'interactions.bin': '-interaction'
    }
    regulations = []
    annotations = []
    genome_version = None
    snpeff_version = None
    if data_dir and os.path.isdir(data_dir):
        for root, dirs, files in os.walk(data_dir):
            for fname in files:
                if fname.startswith('snpEffectPredictor'):
                    # if snpEffectPredictor.bin download succeeded
                    genome_version = os.path.basename(root)
                    dataset.metadata.genome_version = genome_version
                    # read the first line of the gzipped snpEffectPredictor.bin file to get the SnpEff version
                    snpeff_version = self.getSnpeffVersionFromFile(
                        os.path.join(root, fname))
                    if snpeff_version:
                        dataset.metadata.snpeff_version = snpeff_version
                else:
                    m = re.match(regulation_pattern, fname)
                    if m:
                        regulations.append(m.groups()[0])
                    elif fname in annotations_dict:
                        annotations.append(annotations_dict[fname].lstrip('-'))
    dataset.metadata.regulation = regulations
    dataset.metadata.annotation = annotations
    try:
        with open(dataset.file_name, 'w') as fh:
            # BUGFIX: "%" binds tighter than the conditional expression, so
            # the original parsed as ("%s\n" % x) if x else 'fallback' and
            # wrote the fallback text WITHOUT a trailing newline, gluing the
            # next line onto it. Parenthesize the conditional instead.
            fh.write("%s\n" % (genome_version if genome_version else 'Genome unknown'))
            fh.write("%s\n" % (snpeff_version if snpeff_version else 'SnpEff version unknown'))
            if annotations:
                fh.write("annotations: %s\n" % ','.join(annotations))
            if regulations:
                fh.write("regulations: %s\n" % ','.join(regulations))
    except Exception:
        # Writing the summary is best-effort; never fail metadata setting.
        pass
def merge(split_files, output_file):
    """Merge MIRA assembly files.

    Only the degenerate one-file case is supported (a plain move/copy via
    the Text base class); merging several MIRA assemblies is non-trivial
    and has not been implemented.
    """
    if not split_files:
        raise ValueError("Given no MIRA, %r, to merge into %s"
                         % (split_files, output_file))
    if len(split_files) == 1:
        # For one file only, use base class method (move/copy)
        return Text.merge(split_files, output_file)
    raise NotImplementedError("Merging MIRA Assembly Files has not been implemented")
def set_meta(self, dataset, **kwd):
    """Collect snpEff database metadata and write a short summary.

    Walks ``dataset.extra_files_path`` for snpEff files, recording genome
    version, SnpEff version, and regulation/annotation extras, then writes
    a human-readable summary into the dataset file itself.
    """
    Text.set_meta(self, dataset, **kwd)
    data_dir = dataset.extra_files_path
    # search data_dir/genome_version for files
    regulation_pattern = 'regulation_(.+).bin'
    # annotation files that are included in snpEff by a flag
    annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'}
    regulations = []
    annotations = []
    genome_version = None
    snpeff_version = None
    if data_dir and os.path.isdir(data_dir):
        for root, dirs, files in os.walk(data_dir):
            for fname in files:
                if fname.startswith('snpEffectPredictor'):
                    # if snpEffectPredictor.bin download succeeded
                    genome_version = os.path.basename(root)
                    dataset.metadata.genome_version = genome_version
                    # read the first line of the gzipped snpEffectPredictor.bin file to get the SnpEff version
                    snpeff_version = self.getSnpeffVersionFromFile(os.path.join(root, fname))
                    if snpeff_version:
                        dataset.metadata.snpeff_version = snpeff_version
                else:
                    m = re.match(regulation_pattern, fname)
                    if m:
                        regulations.append(m.groups()[0])
                    elif fname in annotations_dict:
                        annotations.append(annotations_dict[fname].lstrip('-'))
    dataset.metadata.regulation = regulations
    dataset.metadata.annotation = annotations
    try:
        # BUGFIX: ``file()`` is a Python-2-only builtin; use open() in a
        # context manager so the handle is closed even if a write fails.
        with open(dataset.file_name, 'w') as fh:
            # BUGFIX: parenthesize the conditional -- "%" binds tighter than
            # "if/else", so the original wrote the fallback text without a
            # trailing newline.
            fh.write("%s\n" % (genome_version if genome_version else 'Genome unknown'))
            fh.write("%s\n" % (snpeff_version if snpeff_version else 'SnpEff version unknown'))
            if annotations:
                fh.write("annotations: %s\n" % ','.join(annotations))
            if regulations:
                fh.write("regulations: %s\n" % ','.join(regulations))
    except Exception:
        # Summary is best-effort only; narrow from the original bare except.
        pass
def merge(split_files, output_file):
    """Merge MIRA assembly files (unsupported beyond the trivial case).

    A single input is simply moved/copied by the Text base class; a true
    multi-file MIRA merge may not be possible and is not implemented.
    """
    count = len(split_files)
    if count == 1:
        # For one file only, use base class method (move/copy)
        return Text.merge(split_files, output_file)
    if count == 0:
        raise ValueError("Given no MIRA, %r, to merge into %s"
                         % (split_files, output_file))
    raise NotImplementedError("Merging MIRA Assembly Files has not been implemented")
def set_meta(self, dataset, **kwd):
    """Record snpEff database metadata and summarize it in the dataset file.

    Walks ``dataset.extra_files_path`` for snpEff files: the directory of a
    ``snpEffectPredictor*`` file gives the genome version; regulation and
    annotation extras are collected into list metadata.
    """
    Text.set_meta(self, dataset, **kwd)
    data_dir = dataset.extra_files_path
    # search data_dir/genome_version for files
    regulation_pattern = "regulation_(.+).bin"
    # annotation files that are included in snpEff by a flag
    annotations_dict = {"nextProt.bin": "-nextprot", "motif.bin": "-motif"}
    regulations = []
    annotations = []
    # BUGFIX: initialize so the summary block below cannot raise NameError
    # when no snpEffectPredictor file is found (previously that NameError
    # was silently swallowed by a bare except, leaving the summary
    # unwritten and the file handle unclosed).
    genome_version = None
    if data_dir and os.path.isdir(data_dir):
        for root, dirs, files in os.walk(data_dir):
            for fname in files:
                if fname.startswith("snpEffectPredictor"):
                    # if snpEffectPredictor.bin download succeeded
                    genome_version = os.path.basename(root)
                    dataset.metadata.genome_version = genome_version
                else:
                    m = re.match(regulation_pattern, fname)
                    if m:
                        regulations.append(m.groups()[0])
                    elif fname in annotations_dict:
                        annotations.append(annotations_dict[fname].lstrip("-"))
    dataset.metadata.regulation = regulations
    dataset.metadata.annotation = annotations
    try:
        # BUGFIX: Python 3 has no ``file`` builtin; use open() in a context
        # manager so the handle is always closed.
        with open(dataset.file_name, "w") as fh:
            if genome_version:
                fh.write("%s\n" % genome_version)
            if annotations:
                fh.write("annotations: %s\n" % ",".join(annotations))
            if regulations:
                fh.write("regulations: %s\n" % ",".join(regulations))
    except Exception:
        # Writing the summary is best-effort only.
        pass
def merge(split_files, output_file):
    """Merge fps files into one.

    The ``#``-prefixed header is merged manually: only the header of the
    first file is kept; header/comment lines in later files are dropped.
    """
    if len(split_files) == 1:
        # For one file only, use base class method (move/copy)
        return Text.merge(split_files, output_file)
    if not split_files:
        raise ValueError("No fps files given, %r, to merge into %s" % (split_files, output_file))
    with open(output_file, "w") as out:
        keep_header = True
        for filename in split_files:
            with open(filename) as handle:
                for line in handle:
                    if not line.startswith('#'):
                        # First data line seen: from here on, any further
                        # '#' lines (in this or later files) are discarded.
                        keep_header = False
                        out.write(line)
                    elif keep_header:
                        out.write(line)
def merge(split_files, output_file):
    """Merge CML files into a single CML document.

    Each input must start with an XML declaration followed by an opening
    ``<cml xmlns="http://www.xml-cml.org/schema...`` tag; its ``<molecule>``
    elements are concatenated and a single closing ``</cml>`` is appended.

    Raises ValueError for empty, non-XML, or non-CML inputs (the offending
    header is written to ``output_file`` for diagnosis).
    """
    if len(split_files) == 1:
        # For one file only, use base class method (move/copy)
        return Text.merge(split_files, output_file)
    if not split_files:
        raise ValueError("Given no CML files, %r, to merge into %s" % (split_files, output_file))
    with open(output_file, "w") as out:
        for index, filename in enumerate(split_files):
            with open(filename) as handle:
                header = handle.readline()
                if not header:
                    raise ValueError("CML file %s was empty" % filename)
                if not header.lstrip().startswith('<?xml version="1.0"?>'):
                    out.write(header)
                    raise ValueError("%s is not a valid XML file!" % filename)
                line = handle.readline()
                header += line
                if not line.lstrip().startswith(
                        '<cml xmlns="http://www.xml-cml.org/schema'):
                    out.write(header)
                    raise ValueError("%s is not a CML file!" % filename)
                # BUGFIX: the validated header was never written on success,
                # so the merged file ended with </cml> but had no XML
                # declaration or opening <cml> tag (invalid CML). Emit the
                # first file's header exactly once.
                if index == 0:
                    out.write(header)
                molecule_found = False
                for line in handle:
                    # We found two required header lines, the next line should start with <molecule >
                    if line.lstrip().startswith('</cml>'):
                        # Drop per-file closing tags; one is appended at the end.
                        continue
                    if line.lstrip().startswith('<molecule'):
                        molecule_found = True
                    if molecule_found:
                        out.write(line)
        out.write("</cml>\n")
def init_meta(self, dataset, copy_from=None):
    """Initialize this dataset's metadata by delegating to Text.init_meta.

    :param dataset: dataset whose metadata is initialized
    :param copy_from: optional dataset to copy initial metadata from
    """
    Text.init_meta(self, dataset, copy_from=copy_from)
def merge(split_files, output_file):
    """Merge multiple BLAST XML files into one valid BLAST XML file.

    Each input is validated (XML declaration, NCBI BlastOutput DOCTYPE,
    matching headers), then the <Iteration> elements of every file are
    concatenated inside a single <BlastOutput_iterations> element.

    Raises ValueError for missing, empty, non-BLAST, prematurely-ended,
    or header-mismatched inputs; partial headers are written to
    ``output_file`` for diagnosis where noted.
    """
    if len(split_files) == 1:
        # For one file only, use base class method (move/copy)
        return Text.merge(split_files, output_file)
    if not split_files:
        raise ValueError("Given no BLAST XML files, %r, to merge into %s"
                         % (split_files, output_file))
    out = open(output_file, "w")
    for f in split_files:
        if not os.path.isfile(f):
            # Could be a transient networked-file-system delay; retry once.
            log.warning("BLAST XML file %s missing, retry in 1s..." % f)
            sleep(1)
        if not os.path.isfile(f):
            log.error("BLAST XML file %s missing" % f)
            raise ValueError("BLAST XML file %s missing" % f)
        h = open(f)
        header = h.readline()
        if not header:
            h.close()
            # Retry, could be transient error with networked file system...
            # BUGFIX: do NOT close ``out`` before retrying -- if the retry
            # succeeds we must keep writing to it; it is only closed below
            # when we give up for good.
            log.warning("BLAST XML file %s empty, retry in 1s..." % f)
            sleep(1)
            h = open(f)
            header = h.readline()
            if not header:
                out.close()
                h.close()
                log.error("BLAST XML file %s was empty" % f)
                raise ValueError("BLAST XML file %s was empty" % f)
        if header.strip() != '<?xml version="1.0"?>':
            out.write(header)  # for diagnosis
            out.close()
            h.close()
            raise ValueError("%s is not an XML file!" % f)
        line = h.readline()
        header += line
        if line.strip() not in [
                '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']:
            out.write(header)  # for diagnosis
            out.close()
            h.close()
            raise ValueError("%s is not a BLAST XML file!" % f)
        # Accumulate the per-file header up to the first <Iteration>.
        while True:
            line = h.readline()
            if not line:
                out.write(header)  # for diagnosis
                out.close()
                h.close()
                raise ValueError("BLAST XML file %s ended prematurely" % f)
            header += line
            if "<Iteration>" in line:
                break
            if len(header) > 10000:
                # Something has gone wrong, don't load too much into memory!
                # Write what we have to the merged file for diagnostics
                out.write(header)
                out.close()
                h.close()
                raise ValueError("BLAST XML file %s has too long a header!" % f)
        if "<BlastOutput>" not in header:
            out.close()
            h.close()
            raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
        if f == split_files[0]:
            # First file: its full header (including the first <Iteration>
            # open tag) is emitted once for the merged document.
            out.write(header)
            old_header = header
        elif old_header[:300] != header[:300]:
            # Enough to check <BlastOutput_program> and <BlastOutput_version> match
            out.close()
            h.close()
            raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                             % (split_files[0], f, old_header[:300], header[:300]))
        else:
            out.write(" <Iteration>\n")
        for line in h:
            if "</BlastOutput_iterations>" in line:
                break
            # TODO - Increment <Iteration_iter-num> and if required automatic query names
            # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
            out.write(line)
        h.close()
    out.write(" </BlastOutput_iterations>\n")
    out.write("</BlastOutput>\n")
    out.close()
def __init__(self, **kwd):
    """Constructor: delegate to Text, then cap ``max_lines`` at 10."""
    Text.__init__(self, **kwd)
    # NOTE(review): presumably limits how many lines of this datatype are
    # read/shown (e.g. for the peek) -- confirm against the Text base class.
    self.max_lines = 10
def __init__(self, **kwd):
    """Register the '.grp' and '.ti' composite members of this datatype."""
    Text.__init__(self, **kwd)
    specs = (
        ('%s.grp', 'Group File'),
        ('%s.ti', ''),
    )
    for file_pattern, desc in specs:
        self.add_composite_file(
            file_pattern,
            description=desc,
            substitute_name_with_metadata='reference_name',
            is_binary=False,
        )
def init_meta(self, dataset, copy_from=None):
    """Delegate metadata initialization to the Text base class."""
    Text.init_meta(self, dataset, copy_from=copy_from)
def __init__(self, **kwd):
    """Initialize via the Text base class constructor."""
    Text.__init__(self, **kwd)
def merge(split_files, output_file):
    """Merge multiple BLAST XML files into a single BLAST XML document.

    Validates each input (XML declaration, NCBI BlastOutput DOCTYPE,
    header agreement with the first file) and concatenates every file's
    <Iteration> elements inside one <BlastOutput_iterations> element.

    Raises ValueError on missing/empty/non-BLAST/mismatched inputs.
    """
    if len(split_files) == 1:
        # For one file only, use base class method (move/copy)
        return Text.merge(split_files, output_file)
    if not split_files:
        raise ValueError("Given no BLAST XML files, %r, to merge into %s"
                         % (split_files, output_file))
    out = open(output_file, "w")
    for f in split_files:
        if not os.path.isfile(f):
            # Possibly a transient networked-file-system delay; retry once.
            log.warning("BLAST XML file %s missing, retry in 1s..." % f)
            sleep(1)
        if not os.path.isfile(f):
            log.error("BLAST XML file %s missing" % f)
            raise ValueError("BLAST XML file %s missing" % f)
        h = open(f)
        header = h.readline()
        if not header:
            h.close()
            # Retry, could be transient error with networked file system...
            # BUGFIX: ``out`` used to be closed here, so a *successful* retry
            # then wrote to a closed file; it is now closed only on failure.
            # (Also removed the unused ``body`` local from the original.)
            log.warning("BLAST XML file %s empty, retry in 1s..." % f)
            sleep(1)
            h = open(f)
            header = h.readline()
            if not header:
                out.close()
                h.close()
                log.error("BLAST XML file %s was empty" % f)
                raise ValueError("BLAST XML file %s was empty" % f)
        if header.strip() != '<?xml version="1.0"?>':
            out.write(header)  # for diagnosis
            out.close()
            h.close()
            raise ValueError("%s is not an XML file!" % f)
        line = h.readline()
        header += line
        if line.strip() not in [
                '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']:
            out.write(header)  # for diagnosis
            out.close()
            h.close()
            raise ValueError("%s is not a BLAST XML file!" % f)
        # Read the rest of the per-file header, up to the first <Iteration>.
        while True:
            line = h.readline()
            if not line:
                out.write(header)  # for diagnosis
                out.close()
                h.close()
                raise ValueError("BLAST XML file %s ended prematurely" % f)
            header += line
            if "<Iteration>" in line:
                break
            if len(header) > 10000:
                # Something has gone wrong, don't load too much into memory!
                # Write what we have to the merged file for diagnostics
                out.write(header)
                out.close()
                h.close()
                raise ValueError("BLAST XML file %s has too long a header!" % f)
        if "<BlastOutput>" not in header:
            out.close()
            h.close()
            raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
        if f == split_files[0]:
            # First file's header (through its first <Iteration> open tag)
            # becomes the merged document's header.
            out.write(header)
            old_header = header
        elif old_header[:300] != header[:300]:
            # Enough to check <BlastOutput_program> and <BlastOutput_version> match
            out.close()
            h.close()
            raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                             % (split_files[0], f, old_header[:300], header[:300]))
        else:
            out.write(" <Iteration>\n")
        for line in h:
            if "</BlastOutput_iterations>" in line:
                break
            # TODO - Increment <Iteration_iter-num> and if required automatic query names
            # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
            out.write(line)
        h.close()
    out.write(" </BlastOutput_iterations>\n")
    out.write("</BlastOutput>\n")
    out.close()
def __init__(self, **kwd):
    """Declare the bgzipped dbNSFP data file plus its tabix index."""
    Text.__init__(self, **kwd)
    # Main bgzip-compressed dbNSFP data file.
    self.add_composite_file(
        '%s.gz',
        description='dbNSFP bgzip',
        substitute_name_with_metadata='reference_name',
        is_binary=True,
    )
    # Matching tabix index for the data file.
    self.add_composite_file(
        '%s.gz.tbi',
        description='Tabix Index File',
        substitute_name_with_metadata='reference_name',
        is_binary=True,
    )
def __init__(self, **kwd):
    """Plain pass-through constructor; all keywords go to the Text base."""
    Text.__init__(self, **kwd)