def multiDocs2TermFreqInfo(labeled_documents:AvailableInputTypes):
    """Generate the objects needed to construct a term-frequency matrix.

    :param labeled_documents: mapping of label -> list of documents, where each
        document is an iterable of tokens (dict or SqliteDict).
    :return: SetDocumentInformation holding the sparse CSR term-frequency
        matrix, a label->row-id mapping, and a feature->column-id mapping.
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))

    # Flatten every document under a label and count token frequencies.
    # Counter consumes the chained iterator directly; the intermediate list()
    # the original built was a needless materialization of the whole stream.
    counted_frequency = [
        (label, Counter(itertools.chain.from_iterable(documents)))
        for label, documents in labeled_documents.items()
    ]
    feature_documents = [
        dict(label_freqCounter_tuple[1])
        for label_freqCounter_tuple in counted_frequency
    ]

    # use sklearn feature-extraction to build the sparse matrix
    vec = DictVectorizer()
    matrix_object = vec.fit_transform(feature_documents).tocsr()
    feature2id = {
        feat: feat_id for feat_id, feat in enumerate(vec.get_feature_names())
    }
    # row order of the matrix follows the order of counted_frequency
    label2id = {
        label_freqCounter_tuple[0]: label_id
        for label_id, label_freqCounter_tuple in enumerate(counted_frequency)
    }

    return SetDocumentInformation(matrix_object, label2id, feature2id)
def multiDocs2DocFreqInfo(labeled_documents:AvailableInputTypes,
                          n_jobs:int=1)->SetDocumentInformation:
    """Generate the objects needed to construct a document-frequency matrix.

    :param labeled_documents: mapping of label -> list of documents
        (dict or SqliteDict).
    :param n_jobs: number of joblib worker processes for per-label counting.
    :return: SetDocumentInformation holding the sparse CSR document-frequency
        matrix, a label->row-id mapping, and a feature->column-id mapping.
    :raises AssertionError: if the labels mix different feature types.
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))
    # Every label must contain documents of one and the same feature type;
    # a mixed corpus would make the resulting matrix meaningless.
    type_flag = {judge_feature_type(docs) for docs in labeled_documents.values()}
    assert len(type_flag) == 1

    # Count document frequency per label in parallel; labels are sorted so
    # the row order (and thus label2id) is deterministic across runs.
    counted_frequency = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(generate_document_dict)(key, docs)
        for key, docs in sorted(labeled_documents.items(),
                                key=lambda key_value_tuple: key_value_tuple[0]))
    feature_documents = [
        dict(label_freqCounter_tuple[1])
        for label_freqCounter_tuple in counted_frequency
    ]

    # use sklearn feature-extraction to build the sparse matrix
    vec = DictVectorizer()
    matrix_object = vec.fit_transform(feature_documents).tocsr()
    feature2id = {
        feat: feat_id for feat_id, feat in enumerate(vec.get_feature_names())
    }
    label2id = {
        label_freqCounter_tuple[0]: label_id
        for label_id, label_freqCounter_tuple in enumerate(counted_frequency)
    }

    return SetDocumentInformation(matrix_object, label2id, feature2id)
def count_document_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int])->numpy.ndarray:
    """Count n(docs) per label and return the counts ordered by label id.

    :param labeled_documents: mapping of label -> list of documents.
    :param label2id: mapping of label string -> integer row index.
    :return: int64 array where position label2id[label] holds n(docs).
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))
    assert isinstance(label2id, dict)

    # Single pass: write each label's document count straight into the
    # slot given by its label id.
    distribution = [0] * len(labeled_documents)
    for label_string, document_lists in labeled_documents.items():
        distribution[label2id[label_string]] = len(document_lists)

    return numpy.array(distribution, dtype='i8')
def count_term_frequency_distribution(self, labeled_documents:AvailableInputTypes, label2id:Dict[str,int]):
    """Count the total term frequency per label, ordered by label id.

    :param labeled_documents: mapping of label -> list of documents.
    :param label2id: mapping of label string -> integer row index.
    :return: int64 array where position label2id[label] holds the total
        number of tokens observed under that label.
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))
    assert isinstance(label2id, dict)

    # Single pass: flatten each label's documents, count the tokens, and
    # store the count at the label's id position.
    distribution = [0] * len(labeled_documents)
    for label_string, document_lists in labeled_documents.items():
        distribution[label2id[label_string]] = len(list(utils.flatten(document_lists)))

    return numpy.array(distribution, dtype='i8')
def count_term_frequency_distribution(
        self,
        labeled_documents: AvailableInputTypes,
        label2id: Dict[str, int]):
    """Count the total term frequency per label, ordered by label id.

    :param labeled_documents: mapping of label -> list of documents.
    :param label2id: mapping of label string -> integer row index.
    :return: int64 array where position label2id[label] holds the total
        number of tokens observed under that label.
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))
    assert isinstance(label2id, dict)

    # count total term-frequency per label.
    # The list(...) wrappers around .items() were Python2->3 conversion
    # artifacts; iterating the views directly avoids copying them.
    term_frequency_distribution = {
        label: len(list(utils.flatten(document_lists)))
        for label, document_lists in labeled_documents.items()
    }

    # place each label's count at the slot given by its label id
    term_frequency_distribution_list = [0] * len(labeled_documents)
    for label_string, n_term in term_frequency_distribution.items():
        term_frequency_distribution_list[label2id[label_string]] = n_term

    return numpy.array(term_frequency_distribution_list, dtype='i8')
def count_document_distribution(self,
                                labeled_documents: AvailableInputTypes,
                                label2id: Dict[str, int]) -> numpy.ndarray:
    """Count n(docs) per label and return the counts ordered by label id.

    :param labeled_documents: mapping of label -> list of documents.
    :param label2id: mapping of label string -> integer row index.
    :return: int64 array where position label2id[label] holds n(docs).
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))
    assert isinstance(label2id, dict)

    # Pre-size the output, then drop each label's document count into the
    # position dictated by its label id.
    counts_by_id = [0] * len(labeled_documents)
    for label_string, document_lists in labeled_documents.items():
        counts_by_id[label2id[label_string]] = len(document_lists)

    return numpy.array(counts_by_id, dtype='i8')
def make_multi_docs2term_freq_info(labeled_documents: AvailableInputTypes,
                                   is_use_cache: bool = True,
                                   path_work_dir: str = None):
    """* What u can do
    - This function generates information to construct term-frequency matrix

    :param labeled_documents: mapping of label -> list of documents
        (dict or SqliteDict).
    :param is_use_cache: when True, matrix elements are kept in an on-disk
        cache object instead of an in-memory dict.
    :param path_work_dir: directory for the cache. Defaults to a fresh
        temporary directory created per call.

    NOTE(fix): the previous default ``tempfile.mkdtemp()`` was evaluated once
    at import time, so every call shared a single directory that was created
    as an import side effect even when the cache was never used. The ``None``
    sentinel defers the mkdtemp to call time.
    """
    assert isinstance(labeled_documents, (SqliteDict, dict))
    if path_work_dir is None:
        path_work_dir = tempfile.mkdtemp()

    # flatten all documents under a label and count token frequencies
    counted_frequency = [
        (label, Counter(list(itertools.chain.from_iterable(documents))))
        for label, documents in labeled_documents.items()
    ]
    feature_documents = [
        dict(label_freqCounter_tuple[1])
        for label_freqCounter_tuple in counted_frequency
    ]

    if is_use_cache:
        dict_matrix_index = init_cache_object('matrix_element_objects',
                                              path_work_dir=path_work_dir)
    else:
        dict_matrix_index = {}

    # use sklearn feature-extraction to build the sparse matrix
    vec = DictVectorizer()
    dict_matrix_index['matrix_object'] = vec.fit_transform(
        feature_documents).tocsr()
    dict_matrix_index['feature2id'] = {
        feat: feat_id for feat_id, feat in enumerate(vec.get_feature_names())
    }
    # row order of the matrix follows the order of counted_frequency
    dict_matrix_index['label2id'] = {
        label_freqCounter_tuple[0]: label_id
        for label_id, label_freqCounter_tuple in enumerate(counted_frequency)
    }

    return SetDocumentInformation(dict_matrix_index)