Example #1
def _load_from_cache(cache_file_name, logger):
    """
    Load from cache file
    """
    try:
        if not os.path.exists(cache_file_name):
            return None, None, None

        logger.debug("  Loading taxa data from cache")
        data_file = open(cache_file_name, 'rb')

        #
        #   Is file version correct
        #
        latest_version, errmsg = check_cache_file_version(
            data_file, _FILE_VERSION_MAJ)
        if not latest_version:
            logger.warning(errmsg)
            return None, None, None

        #
        #   taxon data
        #
        taxon_id_to_scientific_name = marshal.load(data_file)
        taxon_id_to_common_name = marshal.load(data_file)
        taxon_id_to_parent = marshal.load(data_file)
        return (taxon_id_to_scientific_name, taxon_id_to_common_name,
                taxon_id_to_parent)

    except:
        raise
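# ---------------------------------------------------------------------------
# A minimal usage sketch for the taxa loader above (assuming the definition
# directly above is the one in scope). The cache path "taxa.cache" and the
# logger name are illustrative assumptions, not part of the original module.
# ---------------------------------------------------------------------------
def _example_print_lineage(cache_file_name="taxa.cache"):
    import logging
    logger = logging.getLogger("taxa_cache")

    sci_names, common_names, parents = _load_from_cache(cache_file_name, logger)
    if not sci_names:
        return

    # walk an arbitrary taxon up towards the root via the parent mapping
    taxon_id = next(iter(sci_names))
    while taxon_id in sci_names:
        print(sci_names[taxon_id])
        parent_id = parents.get(taxon_id)
        if parent_id is None or parent_id == taxon_id:
            break
        taxon_id = parent_id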
def _load_from_cache(cache_file_name, logger, data_to_load=None):
    """
    Load species data from cache file
        data_to_load is currently ignored: the species data and the ortholog
        species pairs are always loaded.
    """
    start_time = time.time()
    try:
        if not os.path.exists(cache_file_name):
            return None, None

        logger.debug("  Loading protein tree from cache")
        logger.info("  Loading cache file = %s" % cache_file_name)
        data_file = open(cache_file_name, 'rb')

        #
        #   Is file version correct
        #
        latest_version, errmsg = check_cache_file_version(
            data_file, _FILE_VERSION_MAJ)
        if not latest_version:
            logger.warning(errmsg)
            return None, None

        taxon_id_to_species_data = cPickle.load(data_file)
        taxon_ids_to_ortholog_species_pair = cPickle.load(data_file)
        return taxon_id_to_species_data, taxon_ids_to_ortholog_species_pair

    except:
        raise
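# ---------------------------------------------------------------------------
# A brief usage sketch for the species-data loader above (again assuming the
# definition directly above is in scope). The cache path "species.cache" is
# an illustrative assumption.
# ---------------------------------------------------------------------------
def _example_summarise_species_cache(cache_file_name="species.cache"):
    import logging
    logger = logging.getLogger("species_cache")

    species_data, ortholog_pairs = _load_from_cache(cache_file_name, logger)
    if species_data is None:
        return

    logger.info("%d species, %d ortholog species pairs" %
                (len(species_data), len(ortholog_pairs)))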
def _load_feature_types_from_cache(cache_file_name, logger):
    """
    Load features from cache file
    """
    try:
        if not os.path.exists(cache_file_name):
            return None

        logger.debug("  Loading feature types from cache")
        data_file = open(cache_file_name, 'rb')

        #
        #   Is file version correct
        #
        latest_version, errmsg = check_cache_file_version(
            data_file, _FILE_VERSION_MAJ)
        if not latest_version:
            logger.warning(errmsg)
            return None

        #
        #   Ignore feature type data
        #
        cnt_protein_features = marshal.load(data_file)
        for i in range(cnt_protein_features):
            name = marshal.load(data_file)
            t_protein_feature.load(data_file)

        #
        #   Retrieve features by type
        #
        file_pos_by_section = read_directory_of_sections(data_file)
        return sorted(file_pos_by_section.keys(), key=str.lower)

    except:
        raise
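# ---------------------------------------------------------------------------
# A minimal sketch (not part of the original module) that logs the feature
# types available in a cache file. The cache path "features.cache" is an
# illustrative assumption.
# ---------------------------------------------------------------------------
def _example_list_feature_types(cache_file_name="features.cache"):
    import logging
    logger = logging.getLogger("feature_cache")

    feature_types = _load_feature_types_from_cache(cache_file_name, logger)
    if not feature_types:
        return []

    logger.info("Available feature types: %s" % ", ".join(feature_types))
    return feature_types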
def _load_from_cache(cache_file_name, logger, data_to_load=None):
    """
    Load tree data from cache file
        data_to_load is a list or set of the following values:
                "alignment", "node", "node_to_parent", which determines what data to load;
                usually just "node" or "node_to_parent" is useful.
    """
    start_time = time.time()
    try:
        if not os.path.exists(cache_file_name):
            return None, None, None

        logger.debug("  Loading protein tree from cache")
        logger.info("  Loading cache file = %s" % cache_file_name)
        data_file = open(cache_file_name, 'rb')

        #
        #   Is file version correct
        #
        latest_version, errmsg = check_cache_file_version(
            data_file, _FILE_VERSION_MAJ)
        if not latest_version:
            logger.warning(errmsg)
            return None, None, None

        #
        #   Retrieve protein tree data in sections
        #
        file_pos_by_section = read_directory_of_sections(data_file)

        #
        #   Read all data if none specified
        #
        if data_to_load is None:
            data_to_load = file_pos_by_section.keys()

        load_method = dict()
        load_method["alignment"]       = load_dict_of_objects, t_prot_tree_alignment
        load_method["node"]            = load_dict_of_objects, t_prot_tree_node
        load_method["node_to_parent"]  = (lambda d, ignore: marshal.load(d)), dict

        data = dict()
        for section in file_pos_by_section:

            #
            #   initialise to empty data
            #
            data[section] = dict()

            if section not in data_to_load:
                continue

            #
            #   read data
            #
            data_file.seek(file_pos_by_section[section], os.SEEK_SET)
            load_func, object_type = load_method[section]

            data[section] = load_func(data_file, object_type)

        #
        #   Log what data have been read
        #
        end_time = time.time()
        logger.info("  Loaded tree data in %ds" % (end_time - start_time))

        # prot_node_id_to_alignments, prot_node_id_to_nodes, prot_node_id_to_parent
        return [data[k] for k in sorted(data.keys())]
    except:
        raise
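# ---------------------------------------------------------------------------
# A minimal sketch of the selective loading described in the docstring above:
# only the "node_to_parent" section is requested, so the other sections come
# back as empty dicts. The cache path "prot_tree.cache" is an illustrative
# assumption, and the unpacking assumes the usual three sections
# ("alignment", "node", "node_to_parent") are present in the cache file.
# ---------------------------------------------------------------------------
def _example_count_tree_parents(cache_file_name="prot_tree.cache"):
    import logging
    logger = logging.getLogger("tree_cache")

    alignments, nodes, node_to_parent = _load_from_cache(
        cache_file_name, logger, data_to_load=["node_to_parent"])
    if not node_to_parent:
        return 0

    logger.info("%d nodes have a recorded parent" % len(node_to_parent))
    return len(node_to_parent)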
def _load_features_from_cache(cache_file_name,
                              logger,
                              filter_feature_types=None):
    """
    Load features from cache file
    """
    start_time = time.time()
    try:
        if not os.path.exists(cache_file_name):
            logger.warning("The cache file %s does not exist" %
                           cache_file_name)
            return None, None

        logger.debug("  Loading features from cache")
        logger.info("  Loading cache file = %s" % cache_file_name)
        data_file = open(cache_file_name, 'rb')

        #
        #   Is file version correct
        #
        latest_version, errmsg = check_cache_file_version(
            data_file, _FILE_VERSION_MAJ)
        if not latest_version:
            logger.warning(errmsg)
            return None, None

        #
        #   Retrieve feature type data
        #
        protein_features = dict()
        cnt_protein_features = marshal.load(data_file)
        for i in range(cnt_protein_features):
            name = marshal.load(data_file)
            protein_features[name] = t_protein_feature.load(data_file)

        #
        #   Retrieve features by type
        #
        file_pos_by_section = read_directory_of_sections(data_file)

        #
        #   Read all features if none specified
        #
        if filter_feature_types is None:
            filter_feature_types = file_pos_by_section.keys()

        matches_by_type = defaultdict(lambda: defaultdict(list))
        cnt_all_matches = 0

        #
        #   have case-independent matches
        #
        filter_feature_types_lc = map(str.lower, filter_feature_types)
        feature_type_lc_to_orig = dict(
            zip(map(str.lower, file_pos_by_section.keys()),
                file_pos_by_section.keys()))

        for feature_type_lc in filter_feature_types_lc:
            if feature_type_lc not in feature_type_lc_to_orig:
                continue
            feature_type = feature_type_lc_to_orig[feature_type_lc]

            data_file.seek(file_pos_by_section[feature_type], os.SEEK_SET)

            #
            #   read feature
            #
            cnt_prot_ids = marshal.load(data_file)
            for i in xrange(cnt_prot_ids):
                prot_id = marshal.load(data_file)
                cnt_matches = marshal.load(data_file)
                for j in range(cnt_matches):
                    matches_by_type[feature_type][prot_id].append(
                        t_protein_feature_match.load(data_file))
                cnt_all_matches += cnt_matches

        #
        #   Log what features have been read
        #
        end_time = time.time()
        logger.info("  Loaded %d features in %ds" %
                    (cnt_all_matches, end_time - start_time))

        return protein_features, matches_by_type
    except:
        raise
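# ---------------------------------------------------------------------------
# A hypothetical sketch of a filtered feature load. The cache path and the
# "Pfam" feature type name are illustrative assumptions; the match against
# the types stored in the cache is case-insensitive, as implemented above.
# ---------------------------------------------------------------------------
def _example_count_matches_by_protein(cache_file_name="features.cache"):
    import logging
    logger = logging.getLogger("feature_cache")

    protein_features, matches_by_type = _load_features_from_cache(
        cache_file_name, logger, filter_feature_types=["Pfam"])
    if protein_features is None:
        return

    for feature_type, matches_by_prot_id in matches_by_type.items():
        for prot_id, matches in matches_by_prot_id.items():
            logger.debug("%s: %s has %d matches" %
                         (feature_type, prot_id, len(matches)))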