def upload_file_to_shock_and_get_handle(cls, test_file):
        '''
        Uploads the file in test_file to shock and returns the node and a
        handle to the node.
        '''
        node_id = script_utils.upload_file_to_shock(
            shock_service_url=cls.shock_url,
            filePath=test_file,
            ssl_verify=False,
            token=cls.token)['id']

        handle_id = cls.handle.persist_handle({'id': node_id,
                                               'type': 'shock',
                                               'url': cls.shock_url
                                               })
        return node_id, handle_id
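# A minimal usage sketch (hypothetical, not from the original source): the
# helper above is written as a classmethod on a unittest-style test class,
# so it would typically be called from setUpClass once cls.shock_url,
# cls.token and cls.handle (a handle-service client; the HandleService name
# below is an assumption) have been configured.  URLs and the file path are
# placeholders.
#
#     @classmethod
#     def setUpClass(cls):
#         cls.shock_url = 'https://<host>/services/shock-api'
#         cls.token = os.environ['KB_AUTH_TOKEN']
#         cls.handle = HandleService('https://<host>/services/handle_service',
#                                    token=cls.token)
#         cls.node_id, cls.handle_id = \
#             cls.upload_file_to_shock_and_get_handle('data/test_input.txt')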
def transform(shock_service_url=None,
              handle_service_url=None,
              output_file_name=None,
              input_directory=None,
              working_directory=None,
              shock_id=None,
              handle_id=None,
              input_mapping=None,
              mzml_file_name=None,
              polarity=None,
              atlases=None,
              group=None,
              inclusion_order=None,
              normalization_factor=None,
              retention_correction=None,
              level=logging.INFO,
              logger=None):
    """
    Converts mzML file to MetaboliteAtlas2_MAFileInfo json string.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be
                          stored.  If the output file name is not specified,
                          the name will default to the name of the input file
                          appended with '_finfo'.
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be
                           written to.
        shock_id: Shock id for the hdf file if it already exists in shock.
        handle_id: Handle id for the hdf file if it already exists as a
                   handle.
        input_mapping: JSON string mapping of input files to expected types.
                       If this is not provided, the input directory is
                       scanned for matching files.
        level: Logging level, defaults to logging.INFO.
        atlases: List of MetaboliteAtlas atlas IDs.
        mzml_file_name: Name of the file, optional.  Defaults to the input
                        file name without its extension.
        polarity: Run polarity.
        group: Run group.
        inclusion_order: Run inclusion_order.
        retention_correction: Run retention_correction.
        normalization_factor: Run normalization factor.

    Returns:
        JSON files on disk that can be saved as KBase workspace objects.

    Authors:
        Steven Silvester
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of mzML to MetaboliteAtlas2.MAFileInfo")
    token = os.environ.get('KB_AUTH_TOKEN')

    if not working_directory or not os.path.isdir(working_directory):
        raise Exception(
            "The working directory {0} is not a valid directory!".format(
                working_directory))

    logger.info("Scanning for mzML files.")

    valid_extensions = [".mzML"]

    files = os.listdir(input_directory)
    mzml_files = [
        x for x in files if os.path.splitext(x)[-1] in valid_extensions
    ]

    assert len(mzml_files) != 0

    logger.info("Found {0} files".format(len(mzml_files)))

    for fname in mzml_files:
        path = os.path.join(input_directory, fname)

        if not os.path.isfile(path):
            raise Exception(
                "The input file name {0} is not a file!".format(path))

        hdf_file = mzml_loader.mzml_to_hdf(path)

        if shock_service_url:
            shock_info = script_utils.upload_file_to_shock(logger,
                                                           shock_service_url,
                                                           hdf_file,
                                                           token=token)

        run_info = dict()
        run_info['mzml_file_name'] = (mzml_file_name
                                      or fname.replace('.mzML', ''))
        run_info['atlases'] = atlases or []
        if polarity is not None:
            run_info['polarity'] = polarity
        if group is not None:
            run_info['group'] = group
        if inclusion_order is not None:
            run_info['inclusion_order'] = inclusion_order
        if normalization_factor is not None:
            run_info['normalization_factor'] = normalization_factor
        if retention_correction is not None:
            run_info['retention_correction'] = retention_correction

        if shock_service_url:
            handle_id = script_utils.getHandles(logger,
                                                shock_service_url,
                                                handle_service_url,
                                                [shock_info["id"]],
                                                token=token)[0]
            run_info["run_file_id"] = handle_id
        else:
            run_info['run_file_id'] = hdf_file

        out_file_name = output_file_name or fname.replace('.mzML',
                                                          '_finfo.json')

        # This generates the json for the object
        objectString = simplejson.dumps(run_info, sort_keys=True, indent=4)

        output_file_path = os.path.join(working_directory, out_file_name)
        with open(output_file_path, "w") as outFile:
            outFile.write(objectString)

    logger.info("Conversion completed.")
def transform(shock_service_url=None, handle_service_url=None, 
              output_file_name=None, input_directory=None, 
              working_directory=None, shock_id=None, handle_id=None, 
              input_mapping=None, fasta_reference_only=False, 
              level=logging.INFO, logger=None):
    """
    Converts FASTA file to KBaseGenomes.ContigSet json string.
    Note the MD5 for each contig is generated by uppercasing the sequence.
    The ContigSet MD5 is generated by taking the MD5 of the sorted list of
    individual contig MD5s joined with a comma separator.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be stored.
                          If the output file name is not specified the name will default
                          to the name of the input file appended with '_contig_set'.
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be written to.
        shock_id: Shock id for the fasta file if it already exists in shock
        handle_id: Handle id for the fasta file if it already exists as a handle
        input_mapping: JSON string mapping of input files to expected types.
                       If this is not provided, the input directory is
                       scanned for matching files.
        fasta_reference_only: Creates a reference to the fasta file in Shock,
                              but does not store the sequences in the workspace
                              object.  Not recommended unless the fasta file is
                              larger than 1GB; this is the default behavior for
                              files that large.
        level: Logging level, defaults to logging.INFO.
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.

    Authors:
        Jason Baumohl, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of FASTA to KBaseGenomes.ContigSet")
    token = os.environ.get('KB_AUTH_TOKEN')
        
    if input_mapping is None:
        logger.info("Scanning for FASTA files.")
    
        valid_extensions = [".fa",".fasta",".fna"]
    
        files = os.listdir(input_directory)
        fasta_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]
            
        assert len(fasta_files) != 0
    
        logger.info("Found {0}".format(str(fasta_files)))

        input_file_name = os.path.join(input_directory, fasta_files[0])
    
        if len(fasta_files) > 1:
            logger.warning("Not sure how to handle multiple FASTA files in this context. Using {0}".format(input_file_name))
    else:
        input_file_name = os.path.join(os.path.join(input_directory, "FASTA.DNA.Assembly"), simplejson.loads(input_mapping)["FASTA.DNA.Assembly"])
        
                
    logger.info("Building Object.")
 
    if not os.path.isfile(input_file_name):
        raise Exception("The input file name {0} is not a file!".format(input_file_name))        

    if not os.path.isdir(working_directory):
        raise Exception("The working directory {0} is not a valid directory!".format(working_directory))        

    logger.debug(fasta_reference_only)

    # default if not too large
    contig_set_has_sequences = True 
    if fasta_reference_only:
        contig_set_has_sequences = False 

    fasta_filesize = os.stat(input_file_name).st_size
    if fasta_filesize > 1000000000:
        # Fasta file too large to save sequences into the ContigSet object.
        contigset_warn = """The FASTA input file seems to be too large. A ContigSet
                            object will be created without sequences, but will
                            contain a reference to the file."""
        logger.warning(contigset_warn) 
        contig_set_has_sequences = False 

    input_file_handle = open(input_file_name, 'r')
    
    fasta_header = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    sequence_exists = False
    
    valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNn"
    amino_acid_specific_characters = "PpLlIiFfQqEe" 

    for current_line in input_file_handle:
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_exists) and first_header_found:
                logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
                raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
            if not first_header_found:
                first_header_found = True
            else:
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence :
                    logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
                    raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
                for character in total_sequence:
                    if character not in valid_chars:
                        if character in amino_acid_specific_characters:
                            raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                        raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))
                try:
                    fasta_key, fasta_description = fasta_header.strip().split(' ', 1)
                except ValueError:
                    fasta_key = fasta_header.strip()
                    fasta_description = None
                contig_dict = dict() 
                contig_dict["id"] = fasta_key 
                contig_dict["length"] = len(total_sequence) 
                contig_dict["name"] = fasta_key 
                if fasta_description is None:
                    contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
                else:
                    contig_dict["description"] = "%s.  Note MD5 is generated from uppercasing the sequence" % (fasta_description) 
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest() 
                contig_dict["md5"] = contig_md5 
                contig_set_md5_list.append(contig_md5)
                 
                if contig_set_has_sequences: 
                    contig_dict["sequence"]= total_sequence
                else: 
                    contig_dict["sequence"]= ""
                
                fasta_dict[fasta_key] = contig_dict
               
                # get set up for next fasta sequence
                sequence_list = []
                sequence_exists = False
            
            fasta_header = current_line.replace('>','')
        else:
            sequence_list.append(current_line)
            sequence_exists = True

    input_file_handle.close()

    # wrap up last fasta sequence
    if (not sequence_exists) and first_header_found: 
        logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
        raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
    elif not first_header_found :
        logger.error("There are no contigs in this file") 
        raise Exception("There are no contigs in this file") 
    else: 
        # build up sequence and remove all white space      
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence :
            logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
            raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 

        for character in total_sequence: 
            if character not in valid_chars: 
                if character in amino_acid_specific_characters:
                    raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))

        try:
            fasta_key, fasta_description = fasta_header.strip().split(' ', 1)
        except ValueError:
            fasta_key = fasta_header.strip()
            fasta_description = None
        contig_dict = dict()
        contig_dict["id"] = fasta_key 
        contig_dict["length"] = len(total_sequence)
        contig_dict["name"] = fasta_key
        if fasta_description is None:
            contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
        else:
            contig_dict["description"] = "%s.  Note MD5 is generated from uppercasing the sequence" % (fasta_description)
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"]= contig_md5
        contig_set_md5_list.append(contig_md5)
        
        if contig_set_has_sequences: 
            contig_dict["sequence"] = total_sequence 
        else:
            contig_dict["sequence"]= ""
         
        fasta_dict[fasta_key] = contig_dict 


    if output_file_name is None:
        # default to input file name minus file extension, adding "_contig_set" to the end
        base = os.path.basename(input_file_name)
        output_file_name = "{0}_contig_set.json".format(os.path.splitext(base)[0])
    
    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(sorted(contig_set_md5_list))).hexdigest()
    contig_set_dict["id"] = output_file_name
    contig_set_dict["name"] = output_file_name
    contig_set_dict["source"] = "KBase"
    contig_set_dict["source_id"] = os.path.basename(input_file_name) 
    contig_set_dict["contigs"] = [fasta_dict[x] for x in sorted(fasta_dict.keys())]

    if shock_id is None:
        shock_info = script_utils.upload_file_to_shock(logger, shock_service_url, input_file_name, token=token)
        shock_id = shock_info["id"]
    
    contig_set_dict["fasta_ref"] = shock_id

    # For future development if the type is updated to the handle_reference instead of a shock_reference

    # This generates the json for the object
    objectString = simplejson.dumps(contig_set_dict, sort_keys=True, indent=4)

    logger.info("ContigSet data structure creation completed.  Writing out JSON.")

    output_file_path = os.path.join(working_directory,output_file_name) 
    with open(output_file_path, "w") as outFile:
        outFile.write(objectString)
    
    logger.info("Conversion completed.")
def upload_assembly(
        shock_service_url=None,
        handle_service_url=None,
        input_directory=None,
        #                    shock_id = None,
        #                  handle_id = None,
        input_mapping=None,
        workspace_name=None,
        workspace_service_url=None,
        taxon_reference=None,
        assembly_name=None,
        source=None,
        date_string=None,
        contig_information_dict=None,
        logger=None):
    """
    Uploads CondensedGenomeAssembly
    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle service.
        shock_id: If the shock id exists, use the same file (NEEDS TO BE UPDATED TO HANDLE ID)
        input_mapping: Mapping of input files to expected types; not needed
                       here, since only one file is expected.
        workspace_name: Name of ws to load into
        workspace_service_url: URL of WS server instance the WS is on.
        taxon_reference: The ws reference the assembly points to.  (Optional)
        assembly_name: Name of the assembly object to be created. (Optional) (defaults to file_name)
        source: The source of the data (Ex: Refseq)
        date_string: Date (or date range) associated with data. (Optional)
        contig_information_dict: A mapping that has is_circular and description information (Optional)
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.
    Authors:
        Jason Baumohl, Matt Henderson
    """
    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of FASTA to Assembly object")
    token = os.environ.get('KB_AUTH_TOKEN')

    if input_mapping is None:
        logger.info("Scanning for FASTA files.")

        valid_extensions = [".fa", ".fasta", ".fna", ".fas"]

        #        files = os.listdir(input_directory)
        files = os.listdir(os.path.abspath(input_directory))
        fasta_files = [
            x for x in files if os.path.splitext(x)[-1] in valid_extensions
        ]

        if len(fasta_files) == 0:
            raise Exception(
                "No input file with one of the expected extensions "
                "(.fa, .fasta, .fas or .fna) was found in the input directory.")

        logger.info("Found {0}".format(str(fasta_files)))

        fasta_file_name = os.path.join(input_directory, fasta_files[0])

        if len(fasta_files) > 1:
            logger.warning(
                "Not sure how to handle multiple FASTA files in this context. Using {0}"
                .format(fasta_file_name))
    else:
        logger.info("Input Mapping not none : " + str(input_mapping))
        fasta_file_name = os.path.join(
            os.path.join(input_directory, "FASTA.DNA.Assembly"),
            simplejson.loads(input_mapping)["FASTA.DNA.Assembly"])

    logger.info("Building Object.")

    if not os.path.isfile(fasta_file_name):
        raise Exception(
            "The fasta file name {0} is not a file!".format(fasta_file_name))

    if not os.path.isdir(input_directory):
        raise Exception(
            "The input directory {0} is not a valid directory!".format(
                input_directory))

    ws_client = biokbase.workspace.client.Workspace(workspace_service_url)

    workspace_object = ws_client.get_workspace_info(
        {'workspace': workspace_name})

    workspace_id = workspace_object[0]
    workspace_name = workspace_object[1]

    print "FASTA FILE Name :" + fasta_file_name + ":"

    if assembly_name is None:
        base = os.path.basename(fasta_file_name)
        assembly_name = "{0}_assembly".format(os.path.splitext(base)[0])

    ##########################################
    #ASSEMBLY CREATION PORTION  - consume Fasta File
    ##########################################

    logger.info("Starting conversion of FASTA to Assemblies")
    logger.info("Building Assembly Object.")

    input_file_handle = TextFileDecoder.open_textdecoder(
        fasta_file_name, 'ISO-8859-1')
    fasta_header = None
    fasta_description = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    sequence_exists = False

    total_length = 0
    gc_length = 0
    #Note added X and x due to kb|g.1886.fasta
    valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNnXx"
    amino_acid_specific_characters = "PpLlIiFfQqEe"

    # base_counts is a dict mapping base characters to their counts.
    base_counts = dict()

    sequence_start = 0
    sequence_stop = 0

    current_line = input_file_handle.readline()
    while current_line is not None and len(current_line) > 0:
        #        print "CURRENT LINE: " + current_line
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_exists) and first_header_found:
                logger.error(
                    "There is no sequence related to FASTA record : {0}".
                    format(fasta_header))
                raise Exception(
                    "There is no sequence related to FASTA record : {0}".
                    format(fasta_header))
            if not first_header_found:
                first_header_found = True
                sequence_start = 0
            else:
                sequence_stop = input_file_handle.tell() - len(current_line)
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence:
                    logger.error(
                        "There is no sequence related to FASTA record : {0}".
                        format(fasta_header))
                    raise Exception(
                        "There is no sequence related to FASTA record : {0}".
                        format(fasta_header))
#                for character in total_sequence:
#                    if character not in valid_chars:
#                        if character in amino_acid_specific_characters:
#                            raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
#                        raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))
                seq_count = collections.Counter(total_sequence.upper())
                seq_dict = dict(seq_count)
                for character in seq_dict:
                    if character in base_counts:
                        base_counts[character] = base_counts[
                            character] + seq_dict[character]
                    else:
                        base_counts[character] = seq_dict[character]
                    if character not in valid_chars:
                        if character in amino_acid_specific_characters:
                            raise Exception(
                                "This fasta file may have amino acids in it instead of the required nucleotides."
                            )
                        raise Exception(
                            "This FASTA file has non nucleic acid characters : {0}"
                            .format(character))

                contig_dict = dict()
                Ncount = 0
                if "N" in seq_dict:
                    Ncount = seq_dict["N"]
                contig_dict["Ncount"] = Ncount
                length = len(total_sequence)
                total_length = total_length + length
                contig_gc_length = len(re.findall('G|g|C|c', total_sequence))
                contig_dict["gc_content"] = float(contig_gc_length) / float(
                    length)
                gc_length = gc_length + contig_gc_length
                fasta_key = fasta_header.strip()
                contig_dict["contig_id"] = fasta_key
                contig_dict["length"] = length
                contig_dict["name"] = fasta_key
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
                contig_dict["md5"] = contig_md5
                contig_set_md5_list.append(contig_md5)

                contig_dict["is_circular"] = "Unknown"
                if fasta_description is not None:
                    contig_dict["description"] = fasta_description
                if contig_information_dict is not None:
                    contig_info = contig_information_dict.get(fasta_key)
                    if contig_info is not None:
                        if contig_info.get("definition") is not None:
                            contig_dict["description"] = contig_info["definition"]
                        if contig_info.get("is_circular") is not None:
                            contig_dict["is_circular"] = contig_info["is_circular"]
                contig_dict["start_position"] = sequence_start
                contig_dict["num_bytes"] = sequence_stop - sequence_start

                #                print "Sequence Start: " + str(sequence_start) + "Fasta: " + fasta_key
                #                print "Sequence Stop: " + str(sequence_stop) + "Fasta: " + fasta_key

                if fasta_key in fasta_dict:
                    raise Exception(
                        "The fasta header {0} appears more than once in the file "
                        .format(fasta_key))
                else:
                    fasta_dict[fasta_key] = contig_dict

                # get set up for next fasta sequence
                sequence_list = []
                sequence_exists = False

#               sequence_start = input_file_handle.tell()
            sequence_start = 0

            fasta_header_line = current_line.strip().replace('>', '')
            try:
                fasta_header, fasta_description = fasta_header_line.split(' ', 1)
            except ValueError:
                fasta_header = fasta_header_line
                fasta_description = None
        else:
            if sequence_start == 0:
                sequence_start = input_file_handle.tell() - len(current_line)
            sequence_list.append(current_line)
            sequence_exists = True
        current_line = input_file_handle.readline()
#        print "ENDING CURRENT LINE: " + current_line

    # wrap up last fasta sequence
    if (not sequence_exists) and first_header_found:
        logger.error(
            "There is no sequence related to FASTA record : {0}".format(
                fasta_header))
        raise Exception(
            "There is no sequence related to FASTA record : {0}".format(
                fasta_header))
    elif not first_header_found:
        logger.error("There are no contigs in this file")
        raise Exception("There are no contigs in this file")
    else:
        sequence_stop = input_file_handle.tell()
        # build up sequence and remove all white space
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence:
            logger.error(
                "There is no sequence related to FASTA record : {0}".format(
                    fasta_header))
            raise Exception(
                "There is no sequence related to FASTA record : {0}".format(
                    fasta_header))

#        for character in total_sequence:
        seq_count = collections.Counter(total_sequence.upper())
        seq_dict = dict(seq_count)
        for character in seq_dict:
            if character in base_counts:
                base_counts[
                    character] = base_counts[character] + seq_dict[character]
            else:
                base_counts[character] = seq_dict[character]
            if character not in valid_chars:
                if character in amino_acid_specific_characters:
                    raise Exception(
                        "This fasta file may have amino acids in it instead of the required nucleotides."
                    )
                raise Exception(
                    "This FASTA file has non nucleic acid characters : {0}".
                    format(character))

        contig_dict = dict()
        Ncount = 0
        if "N" in seq_dict:
            Ncount = seq_dict["N"]
        contig_dict["Ncount"] = Ncount
        length = len(total_sequence)
        total_length = total_length + length
        contig_gc_length = len(re.findall('G|g|C|c', total_sequence))
        contig_dict["gc_content"] = float(contig_gc_length) / float(length)
        gc_length = gc_length + contig_gc_length
        fasta_key = fasta_header.strip()
        contig_dict["contig_id"] = fasta_key
        contig_dict["length"] = length
        contig_dict["name"] = fasta_key

        contig_dict["is_circular"] = "Unknown"
        if fasta_description is not None:
            contig_dict["description"] = fasta_description
        if contig_information_dict is not None:
            contig_info = contig_information_dict.get(fasta_key)
            if contig_info is not None:
                if contig_info.get("definition") is not None:
                    contig_dict["description"] = contig_info["definition"]
                if contig_info.get("is_circular") is not None:
                    contig_dict["is_circular"] = contig_info["is_circular"]
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"] = contig_md5
        contig_set_md5_list.append(contig_md5)
        contig_dict["start_position"] = sequence_start
        contig_dict["num_bytes"] = sequence_stop - sequence_start

        if fasta_key in fasta_dict:
            raise Exception(
                "The fasta header {0} appears more than once in the file ".
                format(fasta_key))
        else:
            fasta_dict[fasta_key] = contig_dict
        input_file_handle.close()

    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(
        sorted(contig_set_md5_list))).hexdigest()
    contig_set_dict["assembly_id"] = assembly_name
    contig_set_dict["name"] = assembly_name
    contig_set_dict["external_source"] = source
    contig_set_dict["external_source_id"] = os.path.basename(fasta_file_name)
    #    contig_set_dict["external_source_origination_date"] = str(os.stat(fasta_file_name).st_ctime)

    if date_string is not None:
        contig_set_dict["external_source_origination_date"] = date_string
    contig_set_dict["contigs"] = fasta_dict
    contig_set_dict["dna_size"] = total_length
    contig_set_dict["gc_content"] = float(gc_length) / float(total_length)
    #    print "Fasta dict Keys :"+",".join(fasta_dict.keys())+":"
    contig_set_dict["num_contigs"] = len(fasta_dict.keys())
    contig_set_dict["type"] = "Unknown"
    contig_set_dict["notes"] = "Note MD5s are generated from uppercasing the sequences"
    contig_set_dict["base_counts"] = base_counts
    if taxon_reference is not None:
        contig_set_dict["taxon_ref"] = taxon_reference

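    # shock_id and handle_id were commented out of this function's signature
    # (see above), so these local None assignments mean the branch below
    # always performs a fresh Shock upload.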
    shock_id = None
    handle_id = None
    if shock_id is None:
        shock_info = script_utils.upload_file_to_shock(logger,
                                                       shock_service_url,
                                                       fasta_file_name,
                                                       token=token)
        shock_id = shock_info["id"]
        handles = script_utils.getHandles(logger, shock_service_url,
                                          handle_service_url, [shock_id],
                                          [handle_id], token)
        handle_id = handles[0]

    contig_set_dict["fasta_handle_ref"] = handle_id

    # For future development if the type is updated to the handle_reference instead of a shock_reference
    assembly_not_saved = True
    assembly_provenance = [{
        "script": __file__,
        "script_ver": "0.1",
        "description": "Generated from fasta files generated from v5 of the CS."
    }]
    while assembly_not_saved:
        try:
            assembly_info = ws_client.save_objects({
                "workspace": workspace_name,
                "objects": [{
                    "type": "KBaseGenomeAnnotations.Assembly",
                    "data": contig_set_dict,
                    "name": assembly_name,
                    "provenance": assembly_provenance
                }]
            })
            assembly_not_saved = False
        except biokbase.workspace.client.ServerError as err:
            print "ASSEMBLY SAVE FAILED ON genome " + str(
                assembly_name) + " ERROR: " + str(err)
            raise
        except:
            print "ASSEMBLY SAVE FAILED ON genome " + str(
                assembly_name) + " GENERAL_EXCEPTION: " + str(
                    sys.exc_info()[0])
            raise

    logger.info("Conversion completed.")
def convert(shock_service_url, handle_service_url, input_directory,
            object_name, mean_insert=None, std_dev=None, interleaved=False,
            read_orientation=False, level=logging.INFO, logger=None):
    """
    Converts FASTQ file to KBaseAssembly.PairedEndLibrary json string.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        input_directory: Where the FASTQ file can be found.
        object_name: A name to use when storing the JSON string.
        mean_insert: The average insert size.
        std_dev: Standard deviation of the insert sizes.
        interleaved: Are the reads interleaved?
        read_orientation: Do the reads have an outward orientation?
        level: Logging level, defaults to logging.INFO.
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of FASTQ to KBaseAssembly.PairedEndLibrary.")

    token = os.environ.get('KB_AUTH_TOKEN')

    # scan the directory for files
    logger.info("Scanning for FASTQ files.")
    
    valid_extensions = [".fq",".fastq",".fnq"]
    
    files = os.listdir(input_directory)
    fastq_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]
            
    assert len(fastq_files) != 0
    
    # put the files in shock, get handles
    shock_ids = list()
    for x in fastq_files:
        # input_file_name was undefined in the original; upload each FASTQ
        # file found in the input directory.
        shock_info = script_utils.upload_file_to_shock(
            logger, shock_service_url, os.path.join(input_directory, x),
            token=token)
        shock_ids.append(shock_info["id"])
    
    logger.info("Gathering information.")
    # handle_id was undefined in the original; no pre-existing handles are
    # assumed, so a None placeholder is passed per shock id.
    handles = script_utils.getHandles(logger, shock_service_url,
                                      handle_service_url, shock_ids,
                                      [None] * len(shock_ids), token)
    
    assert len(handles) != 0

    # fill out the object details
    resultObject = dict()
    resultObject["handle_1"] = handles[0]
    
    if len(handles) == 2:
        resultObject["handle_2"] = handles[1]

    if mean_insert is not None:
        resultObject["insert_size_mean"] = mean_insert

    if std_dev is not None:
        resultObject["insert_size_std_dev"] = std_dev

    if interleaved:    
        resultObject["interleaved"] = 1
    
    if read_orientation:
        resultObject["read_orientation_outward"] = 1

    objectString = json.dumps(resultObject, sort_keys=True, indent=4)
    
    logger.info("Writing out JSON.")
    with open(args.output_filename, "w") as outFile:
        outFile.write(objectString)
    
    logger.info("Conversion completed.")
def transform(
    shock_service_url=None,
    handle_service_url=None,
    output_file_name=None,
    input_directory=None,
    working_directory=None,
    level=logging.INFO,
    logger=None,
):
    """
    Converts a FASTQ file to a KBaseAssembly.SingleEndLibrary json string.  

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be stored.  
        input_directory: The directory containing the file.
        working_directory: The directory the resulting json file will be written to.
        level: Logging level, defaults to logging.INFO.
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.

    Authors:
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Scanning for FASTQ files.")

    valid_extensions = [".fq", ".fastq", ".fnq"]

    files = os.listdir(input_directory)
    fastq_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]

    assert len(fastq_files) != 0

    logger.info("Found {0}".format(str(fastq_files)))

    input_file_name = fastq_files[0]

    if len(fastq_files) > 1:
        logger.warning("Not sure how to handle multiple FASTQ files in this context. Using {0}".format(input_file_name))

    kb_token = os.environ.get("KB_AUTH_TOKEN")

    shock_info = script_utils.upload_file_to_shock(
        logger=logger,
        shock_service_url=shock_service_url,
        filePath=os.path.join(input_directory, input_file_name),
        token=kb_token,
    )

    # The original discarded the upload result and called getHandles with no
    # shock ids; passing the uploaded node's id (as the mzML converter above
    # does) is assumed to be the intent.
    handles = script_utils.getHandles(logger, shock_service_url,
                                      handle_service_url, [shock_info["id"]],
                                      token=kb_token)

    assert len(handles) != 0

    objectString = simplejson.dumps({"handle": handles[0]}, sort_keys=True, indent=4)

    if output_file_name is None:
        output_file_name = input_file_name

    with open(os.path.join(working_directory, output_file_name), "w") as f:
        f.write(objectString)
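
# A minimal usage sketch for the converter above.  The service URLs are
# illustrative placeholders for a KBase deployment, and KB_AUTH_TOKEN is
# assumed to already be exported in the environment before the call.
transform(
    shock_service_url="https://kbase.example.org/services/shock-api",
    handle_service_url="https://kbase.example.org/services/handle-service",
    input_directory="/data/reads",
    working_directory="/data/out",
)
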
Example #9
            # to the same version of Python that Narrative uses, which is currently
            # Python 2.7.6, after which this workaround can be removed                    
            if total < 2**31:
                archive_name = os.path.join(working_directory, name) + ".zip"
                with zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED) as archive:
                    for n in files:
                        archive.write(n, arcname=os.path.join(name, n.split(transform_directory + os.sep)[1]))
            else:
                archive_name = os.path.join(working_directory, name) + ".tar.bz2"
                with tarfile.open(archive_name, 'w:bz2') as archive:
                    for n in files:
                        archive.add(n, arcname=os.path.join(name, n.split(transform_directory + os.sep)[1]))
                
        
            shock_info = script_utils.upload_file_to_shock(logger = logger,
                                                           shock_service_url = shock_service_url,
                                                           filePath = archive_name,
                                                           token= kb_token)
            shock_id = shock_info["id"]
        except Exception as e:
            logger.debug("Caught exception while creating archive and sending to SHOCK!")

            if ujs_job_id is not None:
                error_object["status"] = "ERROR : Archive creation failed - {0}".format(str(e))[:handler_utils.UJS_STATUS_MAX]
                error_object["error_message"] = traceback.format_exc()
            
                handler_utils.report_exception(logger, error_object, cleanup_details)

                ujs.complete_job(ujs_job_id,
                                 kb_token,
                                 "Download from {0} failed.".format(workspace_name),
                                 traceback.format_exc(),
                                 None)  # no result object to attach for a failed job
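
# The fragment above picks a .zip archive while the total payload fits in a
# signed 32-bit size (a limit of older zipfile versions without ZIP64 support)
# and falls back to .tar.bz2 for anything larger.  A self-contained sketch of
# that decision; build_archive and its arguments are hypothetical names, not
# part of the original script.
import os
import tarfile
import zipfile

def build_archive(working_directory, name, files, base_directory):
    """Pack files into <name>.zip when small enough, else <name>.tar.bz2."""
    total = sum(os.path.getsize(f) for f in files)
    if total < 2 ** 31:
        archive_name = os.path.join(working_directory, name) + ".zip"
        with zipfile.ZipFile(archive_name, 'w', zipfile.ZIP_DEFLATED) as archive:
            for f in files:
                archive.write(f, arcname=os.path.join(name, os.path.relpath(f, base_directory)))
    else:
        archive_name = os.path.join(working_directory, name) + ".tar.bz2"
        with tarfile.open(archive_name, 'w:bz2') as archive:
            for f in files:
                archive.add(f, arcname=os.path.join(name, os.path.relpath(f, base_directory)))
    return archive_name
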
def transform(shock_service_url=None, handle_service_url=None,
              output_file_name=None, input_directory=None,
              working_directory=None, shock_id=None, handle_id=None,
              input_mapping=None, mzml_file_name=None, polarity=None,
              atlases=None, group=None, inclusion_order=None,
              normalization_factor=None, retention_correction=None,
              level=logging.INFO, logger=None):
    """
    Converts mzML file to MetaboliteAtlas2_MAFileInfo json string.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be
                          stored.  If the output file name is not specified,
                          the name will default to the name of the input file
                          appended with '_finfo'.
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be
                           written to.
        shock_id: Shock id for the hdf file if it already exists in shock
        handle_id: Handle id for the hdf file if it already exists as a
                    handle
        input_mapping: JSON string mapping of input files to expected types.
                       If you don't get this you need to scan the input
                       directory and look for your files.
        level: Logging level, defaults to logging.INFO.
        atlases: List of MetaboliteAtlas atlas IDs.
        mzml_file_name: Name of the run, optional.  Defaults to the input file name without its '.mzML' extension.
        polarity: Run polarity.
        group: Run group.
        inclusion_order: Run inclusion_order.
        retention_correction: Run retention_correction.
        normalization_factor: Run normalization factor.

    Returns:
        JSON files on disk that can be saved as a KBase workspace objects.

    Authors:
        Steven Silvester
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of mzML to MetaboliteAtlas2.MAFileInfo")
    token = os.environ.get('KB_AUTH_TOKEN')

    if not working_directory or not os.path.isdir(working_directory):
        raise Exception("The working directory {0} is not a valid directory!"
                        .format(working_directory))

    logger.info("Scanning for mzML files.")

    valid_extensions = [".mzML"]

    files = os.listdir(input_directory)
    mzml_files = [x for x in files
                  if os.path.splitext(x)[-1] in valid_extensions]

    assert len(mzml_files) != 0

    logger.info("Found {0} files".format(len(mzml_files)))

    for fname in mzml_files:
        path = os.path.join(input_directory, fname)

        if not os.path.isfile(path):
            raise Exception("The input file name {0} is not a file!"
                            .format(path))

        hdf_file = mzml_loader.mzml_to_hdf(path)

        if shock_service_url:
            shock_info = script_utils.upload_file_to_shock(
                logger, shock_service_url, hdf_file, token=token)

        run_info = dict()
        run_info['mzml_file_name'] = (mzml_file_name or
                                      fname.replace('.mzML', ''))
        run_info['atlases'] = atlases or []
        if polarity is not None:
            run_info['polarity'] = polarity
        if group is not None:
            run_info['group'] = group
        if inclusion_order is not None:
            run_info['inclusion_order'] = inclusion_order
        if normalization_factor is not None:
            run_info['normalization_factor'] = normalization_factor
        if retention_correction is not None:
            run_info['retention_correction'] = retention_correction

        if shock_service_url:
            handle_id = script_utils.getHandles(
                logger, shock_service_url, handle_service_url,
                [shock_info["id"]], token=token)[0]
            run_info["run_file_id"] = handle_id
        else:
            run_info['run_file_id'] = hdf_file

        output_file_name = fname.replace('.mzML', '_finfo.json')

        # This generates the json for the object
        objectString = simplejson.dumps(run_info, sort_keys=True, indent=4)

        output_file_path = os.path.join(working_directory, output_file_name)
        with open(output_file_path, "w") as outFile:
            outFile.write(objectString)

    logger.info("Conversion completed.")
Example #12
def transform(shock_service_url=None, handle_service_url=None, 
              output_file_name=None, input_directory=None, 
              working_directory=None, shock_id=None, handle_id=None, 
              input_mapping=None, fasta_reference_only=False, 
              level=logging.INFO, logger=None):
    """
    Converts FASTA file to KBaseGenomes.ContigSet json string.  
    Note the MD5 for the contig is generated by uppercasing the sequence.
    The ContigSet MD5 is generated by taking the MD5 of joining the sorted 
    list of individual contig's MD5s with a comma separator.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be stored.  
                          If the output file name is not specified the name will default 
                          to the name of the input file appended with '_contig_set'
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be written to.
        shock_id: Shock id for the fasta file if it already exists in shock
        handle_id: Handle id for the fasta file if it already exists as a handle
        input_mapping: JSON string mapping of input files to expected types.  
                       If you don't get this you need to scan the input 
                       directory and look for your files.
        fasta_reference_only: Creates a reference to the fasta file in Shock,
                              but does not store the sequences in the workspace
                              object.  Not recommended unless the fasta file is
                              larger than 1GB; this is the default behavior for
                              files that large.
        level: Logging level, defaults to logging.INFO.
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.

    Authors:
        Jason Baumohl, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of FASTA to KBaseGenomes.ContigSet")
    token = os.environ.get('KB_AUTH_TOKEN')
        
    if input_mapping is None:
        logger.info("Scanning for FASTA files.")
    
        valid_extensions = [".fa",".fasta",".fna",".fas"]
    
        files = os.listdir(input_directory)
        fasta_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]
            
        if (len(fasta_files) == 0):
            raise Exception("The input file does not have one of the following extensions .fa, .fasta, .fas or .fna")        

    
        logger.info("Found {0}".format(str(fasta_files)))

        input_file_name = os.path.join(input_directory, fasta_files[0])
    
        if len(fasta_files) > 1:
            logger.warning("Not sure how to handle multiple FASTA files in this context. Using {0}".format(input_file_name))
    else:
        input_file_name = os.path.join(input_directory, "FASTA.DNA.Assembly",
                                       simplejson.loads(input_mapping)["FASTA.DNA.Assembly"])
        
                
    logger.info("Building Object.")
 
    if not os.path.isfile(input_file_name):
        raise Exception("The input file name {0} is not a file!".format(input_file_name))        

    if not os.path.isdir(working_directory):
        raise Exception("The working directory {0} is not a valid directory!".format(working_directory))        

    logger.debug(fasta_reference_only)

    # default if not too large
    contig_set_has_sequences = True 
    if fasta_reference_only:
        contig_set_has_sequences = False 

    fasta_filesize = os.stat(input_file_name).st_size
    if fasta_filesize > 1000000000:
        # Fasta file too large to save sequences into the ContigSet object.
        contigset_warn = """The FASTA input file seems to be too large. A ContigSet
                            object will be created without sequences, but will
                            contain a reference to the file."""
        logger.warning(contigset_warn) 
        contig_set_has_sequences = False 

    input_file_handle = open(input_file_name, 'r')
    
    fasta_header = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    sequence_exists = False
    
    valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNn"
    amino_acid_specific_characters = "PpLlIiFfQqEe" 

    for current_line in input_file_handle:
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_exists) and first_header_found:
                logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
                raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
            if not first_header_found:
                first_header_found = True
            else:
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence :
                    logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
                    raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
                seq_count = collections.Counter(total_sequence)
                seq_dict = dict(seq_count)
                for character in seq_dict:
                    if character not in valid_chars:
                        if character in amino_acid_specific_characters:
                            raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                        raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))
                try:
                    fasta_key, fasta_description = fasta_header.strip().split(' ', 1)
                except ValueError:
                    fasta_key = fasta_header.strip()
                    fasta_description = None

                if fasta_key == '':
                    raise Exception("One of the FASTA header lines ('>') does not have an identifier associated with it")
                contig_dict = dict() 
                contig_dict["id"] = fasta_key 
                contig_dict["length"] = len(total_sequence) 
                contig_dict["name"] = fasta_key
                if fasta_description is None:
                    contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
                else:
                    contig_dict["description"] = "%s.  Note MD5 is generated from uppercasing the sequence" % (fasta_description)
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest() 
                contig_dict["md5"] = contig_md5 
                contig_set_md5_list.append(contig_md5)
                 
                if contig_set_has_sequences: 
                    contig_dict["sequence"]= total_sequence
                else: 
                    contig_dict["sequence"]= ""
                
                if fasta_key in fasta_dict:
                    raise Exception("The fasta header {0} appears more than once in the file ".format(fasta_key)) 
                else:
                    fasta_dict[fasta_key] = contig_dict                 
               
                # get set up for next fasta sequence
                sequence_list = []
                sequence_exists = False
            
            fasta_header = current_line.replace('>','')
        else:
            sequence_list.append(current_line)
            sequence_exists = True

    input_file_handle.close()

    # wrap up last fasta sequence
    if (not sequence_exists) and first_header_found: 
        logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
        raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
    elif not first_header_found :
        logger.error("There are no contigs in this file") 
        raise Exception("There are no contigs in this file") 
    else: 
        # build up sequence and remove all white space      
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence :
            logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
            raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 

        seq_count = collections.Counter(total_sequence)
        seq_dict = dict(seq_count)
        for character in seq_dict:
            if character not in valid_chars: 
                if character in amino_acid_specific_characters:
                    raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))

        try:
            fasta_key, fasta_description = fasta_header.strip().split(' ', 1)
        except ValueError:
            fasta_key = fasta_header.strip()
            fasta_description = None
 
        if fasta_key == '':
            raise Exception("One of the FASTA header lines ('>') does not have an identifier associated with it")
        contig_dict = dict()
        contig_dict["id"] = fasta_key 
        contig_dict["length"] = len(total_sequence)
        contig_dict["name"] = fasta_key
 
        if fasta_description is None: 
            contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
        else: 
            contig_dict["description"] = "%s.  Note MD5 is generated from uppercasing the sequence" % (fasta_description) 
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"]= contig_md5
        contig_set_md5_list.append(contig_md5)
        
        if contig_set_has_sequences: 
            contig_dict["sequence"] = total_sequence 
        else:
            contig_dict["sequence"]= ""
        if fasta_key in fasta_dict:
            raise Exception("The fasta header {0} appears more than once in the file ".format(fasta_key)) 
        else:
            fasta_dict[fasta_key] = contig_dict 


    if output_file_name is None:
        # default to the input file name minus its file extension, with "_contig_set" appended
        base = os.path.basename(input_file_name)
        output_file_name = "{0}_contig_set.json".format(os.path.splitext(base)[0])
    
    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(sorted(contig_set_md5_list))).hexdigest()
    contig_set_dict["id"] = output_file_name
    contig_set_dict["name"] = output_file_name
    contig_set_dict["source"] = "KBase"
    contig_set_dict["source_id"] = os.path.basename(input_file_name) 
    contig_set_dict["contigs"] = [fasta_dict[x] for x in sorted(fasta_dict.keys())]

    if shock_id is None:
        shock_info = script_utils.upload_file_to_shock(logger, shock_service_url, input_file_name, token=token)
        shock_id = shock_info["id"]
    
    contig_set_dict["fasta_ref"] = shock_id

    # For future development if the type is updated to the handle_reference instead of a shock_reference

    # This generates the json for the object
    objectString = simplejson.dumps(contig_set_dict, sort_keys=True, indent=4)
    if len(contig_set_dict["contigs"]) == 0:
        raise Exception("There appears to be no FASTA DNA Sequences in the input file.") 
    # The workspace has a 1GB object size limit
    if sys.getsizeof(objectString) > 1E9:
        contig_set_dict["contigs"] = []
        objectString = simplejson.dumps(contig_set_dict, sort_keys=True, indent=4)
        logger.warning("The FASTA file has so many contigs that an object holding "
                       "them all would exceed the workspace size limit. The resulting "
                       "ContigSet will not include the individual contigs; the "
                       "sequences remain available through the referenced Shock node.")

    logger.info("ContigSet data structure creation completed.  Writing out JSON.")

    output_file_path = os.path.join(working_directory,output_file_name) 
    with open(output_file_path, "w") as outFile:
        outFile.write(objectString)
    
    logger.info("Conversion completed.")
Example #13
def transform(shock_service_url=None,
              handle_service_url=None,
              input_fasta_directory=None,
              wsname=None,
              wsurl=None,
              genome_list_file=None,
              level=logging.INFO, logger=None):
    """
    Uploads KBaseGenomeAnnotations.Assembly
    Args:
        shock_service_url: A url for the KBase SHOCK service.
        input_fasta_directory: The directory where files will be read from.
        level: Logging level, defaults to logging.INFO.
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.
    Authors:
        Jason Baumohl, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    assembly_ws_client = biokbase.workspace.client.Workspace(wsurl)

    assembly_workspace_object = assembly_ws_client.get_workspace_info({'workspace': wsname})

    workspace_id = assembly_workspace_object[0]
    workspace_name = assembly_workspace_object[1]


    genomes_list = list()
    if not os.path.isfile(genome_list_file):
        raise Exception("The genome list file {0} is not a file!".format(genome_list_file))
    with open(genome_list_file, 'r') as genomes_f:
        for genome_line in genomes_f:
            genomes_list.append(genome_line.strip())

    logger.info("Starting conversion of FASTA to Assemblies")
    token = os.environ.get('KB_AUTH_TOKEN')

    for genome_id in genomes_list:

        logger.info("Building Object.")

        # genome ids may contain a pipe (e.g. kb|g.1886); the fasta files on
        # disk are named with the raw id
        input_file_name = os.path.join(input_fasta_directory, "%s.fasta" % genome_id)
        if not os.path.isfile(input_file_name):
            raise Exception("The input file name {0} is not a file!".format(input_file_name))        

        input_file_handle = TextFileDecoder.open_textdecoder(input_file_name, 'ISO-8859-1')
    
        fasta_header = None
        sequence_list = []
        fasta_dict = dict()
        first_header_found = False
        contig_set_md5_list = []
        # Pattern for replacing white space
        pattern = re.compile(r'\s+')
        sequence_exists = False
    
        total_length = 0
        gc_length = 0
        #Note added X and x due to kb|g.1886.fasta
        valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNnXx"
        amino_acid_specific_characters = "PpLlIiFfQqEe" 

        sequence_start = 0
        sequence_stop = 0

        current_line = input_file_handle.readline()
        while current_line:
            if (current_line[0] == ">"):
                # found a header line
                # Wrap up previous fasta sequence
                if (not sequence_exists) and first_header_found:
                    logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
                    raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
                if not first_header_found:
                    first_header_found = True
                    sequence_start = 0
                else:
                    sequence_stop = input_file_handle.tell() - len(current_line)
                    # build up sequence and remove all white space
                    total_sequence = ''.join(sequence_list)
                    total_sequence = re.sub(pattern, '', total_sequence)
                    if not total_sequence :
                        logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
                        raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
                    for character in total_sequence:
                        if character not in valid_chars:
                            if character in amino_acid_specific_characters:
                                raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                            raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))
                    length = len(total_sequence)
                    total_length = total_length + length
                    contig_gc_length = len(re.findall('G|g|C|c',total_sequence))
                    contig_dict = dict() 
                    contig_dict["gc_content"] = float(contig_gc_length)/float(length) 
                    gc_length = gc_length + contig_gc_length
                    fasta_key = fasta_header.strip()
                    contig_dict["contig_id"] = fasta_key 
                    contig_dict["length"] = length 
                    contig_dict["name"] = fasta_key 
                    contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
                    contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest() 
                    contig_dict["md5"] = contig_md5 
                    contig_set_md5_list.append(contig_md5)
                    contig_dict["is_circular"] = "unknown"
                    contig_dict["start_position"] = sequence_start
                    contig_dict["num_bytes"] = sequence_stop - sequence_start


#                    print "Sequence Start: " + str(sequence_start) + "Fasta: " + fasta_key
#                    print "Sequence Stop: " + str(sequence_stop) + "Fasta: " + fasta_key
                    fasta_dict[fasta_key] = contig_dict
               
                    # get set up for next fasta sequence
                    sequence_list = []
                    sequence_exists = False
                
                sequence_start = 0

                fasta_header = current_line.replace('>','')
            else:
                if sequence_start == 0:
                    sequence_start = input_file_handle.tell() - len(current_line) 
                sequence_list.append(current_line)
                sequence_exists = True
            current_line = input_file_handle.readline()

        # wrap up last fasta sequence
        if (not sequence_exists) and first_header_found: 
            logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
            raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
        elif not first_header_found :
            logger.error("There are no contigs in this file") 
            raise Exception("There are no contigs in this file") 
        else: 
            sequence_stop = input_file_handle.tell()
            # build up sequence and remove all white space      
            total_sequence = ''.join(sequence_list)
            total_sequence = re.sub(pattern, '', total_sequence)
            if not total_sequence :
                logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
                raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 

            for character in total_sequence: 
                if character not in valid_chars: 
                    if character in amino_acid_specific_characters:
                        raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                    raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))

            length = len(total_sequence)
            total_length = total_length + length
            contig_gc_length = len(re.findall('G|g|C|c',total_sequence))
            contig_dict = dict()
            contig_dict["gc_content"] = float(contig_gc_length)/float(length) 
            gc_length = gc_length + contig_gc_length
            fasta_key = fasta_header.strip()
            contig_dict["contig_id"] = fasta_key 
            contig_dict["length"] = length
            contig_dict["name"] = fasta_key
            contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
            contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
            contig_dict["md5"]= contig_md5
            contig_set_md5_list.append(contig_md5)
            contig_dict["is_circular"] = "unknown"
            contig_dict["start_position"] = sequence_start
            contig_dict["num_bytes"] = sequence_stop - sequence_start
        
            fasta_dict[fasta_key] = contig_dict 
        input_file_handle.close()

        contig_set_dict = dict()
        contig_set_dict["md5"] = hashlib.md5(",".join(sorted(contig_set_md5_list))).hexdigest()
        contig_set_dict["assembly_id"] = genome_id
        contig_set_dict["name"] = genome_id
        contig_set_dict["external_source"] = "KBase"
        contig_set_dict["external_source_id"] = os.path.basename(input_file_name) 
        contig_set_dict["external_source_origination_date"] = str(os.stat(input_file_name).st_ctime)
        contig_set_dict["contigs"] = fasta_dict
        contig_set_dict["dna_size"] = total_length
        contig_set_dict["gc_content"] = float(gc_length)/float(total_length)
        contig_set_dict["num_contigs"] = len(fasta_dict.keys())
        contig_set_dict["type"] = "Unknown"
        contig_set_dict["notes"] = "Unknown"

        shock_info = script_utils.upload_file_to_shock(logger, shock_service_url, input_file_name, token=token)
        shock_id = shock_info["id"]
        handles = script_utils.getHandles(logger, shock_service_url, handle_service_url, [shock_id], [None], token)
        handle_id = handles[0]

        contig_set_dict["fasta_handle_ref"] = handle_id

        # For future development if the type is updated to the handle_reference instead of a shock_reference


        assembly_not_saved = True 
        assembly_provenance = [{"script": __file__, "script_ver": "0.1", "description": "Generated from fasta files generated from v5 of the CS."}]
        while assembly_not_saved: 
            try: 
                assembly_info =  assembly_ws_client.save_objects({"workspace": workspace_name,"objects":[ 
                            {"type":"KBaseGenomeAnnotations.Assembly", 
                             "data":contig_set_dict, 
                             "name": "%s_assembly" % (genome_id), 
                             "provenance":assembly_provenance}]}) 
                assembly_not_saved = False 
            except biokbase.workspace.client.ServerError as err:
                print "SAVE FAILED ON genome " + str(genome_id) + " ERROR: " + str(err)
                raise
            except:
                print "SAVE FAILED ON genome " + str(genome_id) + " GENERAL_EXCEPTION: " + str(sys.exc_info()[0])
                raise
    
        logger.info("Conversion completed.")