Example #1
0
 def merge(split_files, output_file):
     """Merge MIRA assembly files; only the single-file case is supported.

     A single input is delegated to ``Text.merge``. An empty input raises
     ``ValueError`` and two or more inputs raise ``NotImplementedError``,
     because combining several MIRA files has not been implemented.
     """
     if len(split_files) != 1:
         if split_files:
             # More than one file: there is no merge strategy for MIRA yet.
             raise NotImplementedError("Merging MIRA Assembly Files has not been implemented")
         details = (split_files, output_file)
         raise ValueError("Given no MIRA, %r, to merge into %s" % details)
     # Exactly one file: let the base class handle it (move/copy).
     return Text.merge(split_files, output_file)
Example #2
0
 def merge(split_files, output_file):
     """Combine MIRA files into ``output_file`` (single-file input only).

     With exactly one input the work is handed to ``Text.merge``. With no
     inputs a ``ValueError`` is raised; with several inputs a
     ``NotImplementedError`` is raised since no merge logic exists for
     MIRA assemblies.
     """
     file_count = len(split_files)
     if file_count == 1:
         # Nothing to combine - reuse the base class method (move/copy).
         return Text.merge(split_files, output_file)
     if split_files:
         message = "Merging MIRA Assembly Files has not been implemented"
         raise NotImplementedError(message)
     raise ValueError(
         "Given no MIRA, %r, to merge into %s" % (split_files, output_file))
Example #3
0
 def merge(split_files, output_file):
     """
     Merge fps files into one file that carries a single header.

     Lines starting with '#' are treated as header lines. Only the header
     lines of the first file that precede its first data line are copied;
     '#' lines anywhere else (later in the first file, or in any other
     file) are dropped. All other lines are copied in input order.

     A single input file is delegated to ``Text.merge``.

     :param split_files: paths of the fps files to merge, in order.
     :param output_file: path of the merged file to write.
     :raises ValueError: if ``split_files`` is empty.
     """
     if len(split_files) == 1:
         # For one file only, use base class method (move/copy)
         return Text.merge(split_files, output_file)
     if not split_files:
         raise ValueError("No fps files given, %r, to merge into %s" %
                          (split_files, output_file))
     with open(output_file, "w") as out:
         copy_header = True
         for filename in split_files:
             with open(filename) as handle:
                 for line in handle:
                     if line.startswith('#'):
                         if copy_header:
                             out.write(line)
                     else:
                         # First data line reached: the header is complete.
                         copy_header = False
                         out.write(line)
             # The header comes from the first file only. Without this, a
             # first file holding nothing but header lines would let the
             # next file's header be copied as well.
             copy_header = False
Example #4
0
 def merge(split_files, output_file):
     """
     Merge CML files into a single CML document.

     Every input must start with the line '<?xml version="1.0"?>' followed
     by a line opening the '<cml xmlns="http://www.xml-cml.org/schema...'
     element. These two header lines are written once, taken from the
     first file. From each file, everything from its first '<molecule'
     line onwards is copied, except lines starting with '</cml>'. A single
     closing '</cml>' line is appended at the end.

     A single input file is delegated to ``Text.merge``.

     :param split_files: paths of the CML files to merge, in order.
     :param output_file: path of the merged file to write.
     :raises ValueError: if ``split_files`` is empty, or a file is empty
         or lacks the expected XML / CML header lines. In the latter case
         the offending header is also written to the output for diagnosis.
     """
     if len(split_files) == 1:
         # For one file only, use base class method (move/copy)
         return Text.merge(split_files, output_file)
     if not split_files:
         raise ValueError("Given no CML files, %r, to merge into %s" %
                          (split_files, output_file))
     with open(output_file, "w") as out:
         header_written = False
         for filename in split_files:
             with open(filename) as handle:
                 header = handle.readline()
                 if not header:
                     raise ValueError("CML file %s was empty" % filename)
                 if not header.lstrip().startswith('<?xml version="1.0"?>'):
                     out.write(header)
                     raise ValueError("%s is not a valid XML file!" %
                                      filename)
                 line = handle.readline()
                 header += line
                 if not line.lstrip().startswith(
                         '<cml xmlns="http://www.xml-cml.org/schema'):
                     out.write(header)
                     raise ValueError("%s is not a CML file!" % filename)
                 if not header_written:
                     # Without the XML declaration and the opening <cml>
                     # tag the closing </cml> below would be unbalanced
                     # and the merged file would not be valid XML.
                     out.write(header)
                     header_written = True
                 molecule_found = False
                 # Stream the rest of the file instead of loading it whole.
                 for line in handle:
                     stripped = line.lstrip()
                     if stripped.startswith('</cml>'):
                         # Closing tag is written once, after all files.
                         continue
                     if stripped.startswith('<molecule'):
                         molecule_found = True
                     if molecule_found:
                         out.write(line)
         out.write("</cml>\n")
Example #5
0
 def merge(split_files, output_file):
     """Merge several BLAST XML files into a single BLAST XML file.

     The header (everything up to and including the first ``<Iteration>``
     line) is written once, taken from the first file. For every later
     file the first 300 characters of its header must equal those of the
     first file; its iterations are then appended. Copying from a file
     stops at its ``</BlastOutput_iterations>`` line, and the closing tags
     are written once after all files.

     A single input file is delegated to ``Text.merge``.

     :param split_files: paths of the BLAST XML files to merge, in order.
     :param output_file: path of the merged file to write.
     :raises ValueError: if ``split_files`` is empty, or a file is missing,
         empty, ends before its first ``<Iteration>``, has a header longer
         than 10000 characters, is not BLAST XML, or has a header that
         does not match the first file's header.
     """
     if len(split_files) == 1:
         #For one file only, use base class method (move/copy)
         return Text.merge(split_files, output_file)
     if not split_files:
         raise ValueError("Given no BLAST XML files, %r, to merge into %s" \
                          % (split_files, output_file))
     old_header = None
     #The with block closes the output on every exit path, errors included
     with open(output_file, "w") as out:
         for index, f in enumerate(split_files):
             if not os.path.isfile(f):
                 log.warning("BLAST XML file %s missing, retry in 1s..." % f)
                 sleep(1)
             if not os.path.isfile(f):
                 log.error("BLAST XML file %s missing" % f)
                 raise ValueError("BLAST XML file %s missing" % f)
             h = open(f)
             try:
                 header = h.readline()
                 if not header:
                     h.close()
                     #Retry, could be transient error with networked file system...
                     #The output must stay open here, otherwise a successful
                     #retry would fail when writing to it.
                     log.warning("BLAST XML file %s empty, retry in 1s..." % f)
                     sleep(1)
                     h = open(f)
                     header = h.readline()
                     if not header:
                         log.error("BLAST XML file %s was empty" % f)
                         raise ValueError("BLAST XML file %s was empty" % f)
                 if header.strip() != '<?xml version="1.0"?>':
                     out.write(header)  #for diagnosis
                     raise ValueError("%s is not an XML file!" % f)
                 line = h.readline()
                 header += line
                 if line.strip() not in [
                         '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                         '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">'
                 ]:
                     out.write(header)  #for diagnosis
                     raise ValueError("%s is not a BLAST XML file!" % f)
                 #Accumulate the header up to and including the first <Iteration>
                 while True:
                     line = h.readline()
                     if not line:
                         out.write(header)  #for diagnosis
                         raise ValueError("BLAST XML file %s ended prematurely" % f)
                     header += line
                     if "<Iteration>" in line:
                         break
                     if len(header) > 10000:
                         #Something has gone wrong, don't load too much into memory!
                         #Write what we have to the merged file for diagnostics
                         out.write(header)
                         raise ValueError(
                             "BLAST XML file %s has too long a header!" % f)
                 if "<BlastOutput>" not in header:
                     raise ValueError("%s is not a BLAST XML file:\n%s\n..." %
                                      (f, header))
                 if index == 0:
                     #Decide by position, not by path, so that a path listed
                     #twice does not get the header written a second time.
                     out.write(header)
                     old_header = header
                 elif old_header[:300] != header[:300]:
                     #Enough to check <BlastOutput_program> and <BlastOutput_version> match
                     raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
                                      % (split_files[0], f, old_header[:300], header[:300]))
                 else:
                     #The <Iteration> line was consumed as part of the header
                     out.write("    <Iteration>\n")
                 for line in h:
                     if "</BlastOutput_iterations>" in line:
                         break
                     #TODO - Increment <Iteration_iter-num> and if required automatic query names
                     #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                     out.write(line)
             finally:
                 #Closing an already closed handle is harmless
                 h.close()
         out.write("  </BlastOutput_iterations>\n")
         out.write("</BlastOutput>\n")
Example #6
0
 def merge(split_files, output_file):
     """Merge multiple BLAST XML files into one BLAST XML file.

     Each input is checked for the XML declaration and a BlastOutput
     DOCTYPE line. Its header, meaning all lines up to and including the
     first ``<Iteration>`` line, is then read. The header of the first
     file is written to the output; the headers of later files are only
     compared (first 300 characters) against it. The remaining lines of
     each file are copied until its ``</BlastOutput_iterations>`` line,
     and the closing tags are written once at the very end.

     A single input file is delegated to ``Text.merge``.

     :param split_files: paths of the BLAST XML files to merge, in order.
     :param output_file: path of the merged file to write.
     :raises ValueError: if ``split_files`` is empty, or a file is missing,
         empty, ends before its first ``<Iteration>``, has a header longer
         than 10000 characters, is not BLAST XML, or has a header that
         does not match the first file's header.
     """
     if len(split_files) == 1:
         #For one file only, use base class method (move/copy)
         return Text.merge(split_files, output_file)
     if not split_files:
         raise ValueError("Given no BLAST XML files, %r, to merge into %s" \
                          % (split_files, output_file))
     old_header = None
     #Context manager guarantees the output is closed however we leave
     with open(output_file, "w") as out:
         for position, f in enumerate(split_files):
             if not os.path.isfile(f):
                 log.warning("BLAST XML file %s missing, retry in 1s..." % f)
                 sleep(1)
             if not os.path.isfile(f):
                 log.error("BLAST XML file %s missing" % f)
                 raise ValueError("BLAST XML file %s missing" % f)
             h = open(f)
             try:
                 header = h.readline()
                 if not header:
                     h.close()
                     #Retry, could be transient error with networked file system...
                     #Only the input is closed: the output is still needed
                     #if the retry succeeds.
                     log.warning("BLAST XML file %s empty, retry in 1s..." % f)
                     sleep(1)
                     h = open(f)
                     header = h.readline()
                     if not header:
                         log.error("BLAST XML file %s was empty" % f)
                         raise ValueError("BLAST XML file %s was empty" % f)
                 if header.strip() != '<?xml version="1.0"?>':
                     out.write(header) #for diagnosis
                     raise ValueError("%s is not an XML file!" % f)
                 line = h.readline()
                 header += line
                 if line.strip() not in ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
                                         '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">']:
                     out.write(header) #for diagnosis
                     raise ValueError("%s is not a BLAST XML file!" % f)
                 #Read on until the first <Iteration> line ends the header
                 while True:
                     line = h.readline()
                     if not line:
                         out.write(header) #for diagnosis
                         raise ValueError("BLAST XML file %s ended prematurely" % f)
                     header += line
                     if "<Iteration>" in line:
                         break
                     if len(header) > 10000:
                         #Something has gone wrong, don't load too much into memory!
                         #Write what we have to the merged file for diagnostics
                         out.write(header)
                         raise ValueError("BLAST XML file %s has too long a header!" % f)
                 if "<BlastOutput>" not in header:
                     raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
                 if position == 0:
                     #First file is identified by position; comparing paths
                     #would duplicate the header if a path is listed twice.
                     out.write(header)
                     old_header = header
                 elif old_header[:300] != header[:300]:
                     #Enough to check <BlastOutput_program> and <BlastOutput_version> match
                     raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" \
                                      % (split_files[0], f, old_header[:300], header[:300]))
                 else:
                     #Re-emit the <Iteration> line that was read with the header
                     out.write("    <Iteration>\n")
                 for line in h:
                     if "</BlastOutput_iterations>" in line:
                         break
                     #TODO - Increment <Iteration_iter-num> and if required automatic query names
                     #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                     out.write(line)
             finally:
                 #Safe even when the handle was already closed for the retry
                 h.close()
         out.write("  </BlastOutput_iterations>\n")
         out.write("</BlastOutput>\n")