from scipy.sparse import csr_matrix
from sklearn.base import TransformerMixin
from scipy.stats import norm
from logging import getLogger, StreamHandler
from numpy import ndarray, memmap
from typing import Union
from DocumentFeatureSelection import init_logger
import numpy as np
import joblib
import logging

# Module-level logger configured by the package's init_logger helper.
logger = getLogger(init_logger.LOGGER_NAME)
logger = init_logger.init_logger(logger)


def bns(X: Union[memmap, csr_matrix],
        feature_index: int,
        sample_index: int,
        unit_distribution: np.ndarray,
        true_index: int = 0,
        verbose: bool = False):
    """Score one feature (presumably Bi-Normal Separation, per the function name).

    NOTE(review): the body is truncated in this chunk — only the label-index
    validation and the true-positive lookup are visible; the remainder of the
    computation (and the use of sample_index / unit_distribution / verbose)
    is not shown here.

    :param X: 2-d matrix indexed as X[label_row, feature_column]; either a
        numpy memmap or a scipy CSR sparse matrix.
    :param feature_index: column index of the feature being scored.
    :param sample_index: usage not visible in this chunk.
    :param unit_distribution: usage not visible in this chunk.
    :param true_index: row index of the "true"/positive label; must be 0 or 1.
    :param verbose: usage not visible in this chunk.
    :raises Exception: if true_index is neither 0 nor 1.
    """
    # The matrix has exactly two label rows: whichever of {0, 1} is not the
    # true label is the false label.
    if true_index == 0:
        false_index = 1
    elif true_index == 1:
        false_index = 0
    else:
        raise Exception('true index must be either of 0 or 1')

    # Count of occurrences under the true label (translated from the original
    # Japanese comment).
    # tp is frequency of features in the specified positive label
    tp = X[true_index, feature_index]
from collections import Counter
from DocumentFeatureSelection.models import SetDocumentInformation, AvailableInputTypes, PersistentDict
from DocumentFeatureSelection import init_logger
from DocumentFeatureSelection.common.utils import init_cache_object
from sklearn.feature_extraction import DictVectorizer
from typing import Dict, List, Tuple, Any, Union
from sqlitedict import SqliteDict
import logging
import joblib
import itertools
import tempfile

# Module-level logger configured by the package's init_logger helper.
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))

# Feature-count threshold; presumably switches the processing strategy
# elsewhere in this module — usage not visible in this chunk.
N_FEATURE_SWITCH_STRATEGY = 1000000


def generate_document_dict(document_key: str,
                           documents: List[Union[List[str], Tuple[Any]]]) -> Tuple[str, Counter]:
    """Compute the document frequency (DF) of every token in *documents*.

    Each token is counted at most once per document, so the resulting Counter
    maps token -> number of documents that contain it (not total occurrences).

    :param document_key: label/key identifying this group of documents.
    :param documents: tokenized documents; each document is a list or tuple of tokens.
    :return: tuple of (document_key, document-frequency Counter).
    """
    assert isinstance(documents, list)
    # Counter(document).keys() yields each distinct token once per document,
    # which turns a term-frequency count into a document-frequency count.
    word_frequencies = [Counter(document) for document in documents]
    document_frequencies = Counter()
    for word_frequency in word_frequencies:
        # Counter.update() accepts any iterable of keys directly; the original
        # wrapped keys() in list(), which built a throwaway list per document.
        document_frequencies.update(word_frequency.keys())
    return (document_key, document_frequencies)
from collections import Counter
from DocumentFeatureSelection.models import SetDocumentInformation, AvailableInputTypes
from DocumentFeatureSelection import init_logger
from sklearn.feature_extraction import DictVectorizer
from typing import Dict, List, Tuple, Any, Union
from sqlitedict import SqliteDict
import logging
import joblib
import itertools

# Module-level logger configured by the package's init_logger helper.
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))

N_FEATURE_SWITCH_STRATEGY = 1000000


def decode_into_utf8(string: str) -> bytes:
    """Encode the given text into UTF-8 bytes."""
    encoded = string.encode('utf-8')
    return encoded


def generate_document_dict(document_key: str,
                           documents: List[Union[List[str], Tuple[Any]]]) -> Tuple[str, Counter]:
    """Count, for each token, how many of the given documents contain it.

    This is a document-frequency (DF) count: a token contributes at most one
    per document, regardless of how often it repeats within that document.

    :param document_key: key naming this group of documents.
    :param documents: tokenized documents; each is a list or tuple of tokens.
    :return: tuple of (document_key, document-frequency Counter).
    """
    assert isinstance(documents, list)
    doc_freq = Counter()
    for tokens in documents:
        # Distinct tokens of one document, each incrementing its DF by one.
        doc_freq.update(Counter(tokens).keys())
    return (document_key, doc_freq)