def __init__(self):
    """Initialize the text-processing pipeline components.

    Builds the spelling corrector, preprocessing, normalizer (which
    reuses the corrector), and Sastrawi stemmer, loads the stopword
    list from disk, and prepares character-translation helpers.
    """
    self.corrector = spell.probability()
    self.preprocessing = preprocessing.preprocessing()
    # the normalizer shares the probability-based spelling corrector
    self.normalizing = normalize.normalizer(self.corrector)
    self.stemming = stem.sastrawi()
    # use a context manager so the file handle is closed (the original
    # leaked it: open(...).read() never closes the descriptor)
    with open('modified stopword list.txt', 'r') as fopen:
        self.stopword = fopen.read().split('\n')
    # translation table mapping Unicode superscript digits to ASCII digits
    self.SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")
    self.punctuation = string.punctuation
def _base_summarizer(
    corpus,
    decomposition,
    top_k: int = 3,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    important_words: int = 10,
    retry: int = 5,
    **kwargs,
):
    """Extractive summarization via sentence-graph centrality.

    Sentences are vectorized, reduced with ``decomposition``, linked by
    cosine similarity, and ranked with pagerank; the ``top_k`` highest
    scoring sentences form the summary.

    Parameters
    ----------
    corpus : str or List[str]
        Raw text, or a list of strings that is joined and re-split
        into sentences.
    decomposition : callable
        Factory called with a component count; must return an object
        with ``fit_transform`` (e.g. TruncatedSVD / NMF).
    top_k : int, (default=3)
        Number of top-ranked sentences returned.
    max_df : float, (default=0.95)
        Forwarded to the vectorizer; must satisfy 0 < max_df <= 1.
    min_df : int, (default=2)
        Forwarded to the vectorizer; must be >= 1.
    ngram : Tuple[int, int], (default=(1, 3))
        N-gram range for the vectorizer.
    vectorizer : str, (default='bow')
        One of 'tfidf', 'bow', 'skip-gram'.
    important_words : int, (default=10)
        Number of highest-weighted vocabulary terms reported.
    retry : int, (default=5)
        Retry budget forwarded to ``pagerank``.

    Returns
    -------
    dict
        Keys 'summary', 'top-words' and 'cluster-top-words'.

    Raises
    ------
    ValueError
        On invalid ``vectorizer``, ``min_df``, ``max_df`` or ``corpus``.
    """
    vectorizer = vectorizer.lower()
    if vectorizer not in ('tfidf', 'bow', 'skip-gram'):
        raise ValueError(
            "vectorizer must be in ['tfidf', 'bow', 'skip-gram']")
    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (0 < max_df <= 1):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not isinstance(corpus, (str, list)):
        # original message said 'must be a list' although str is accepted
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        # validate every element, not only the first; an empty list used
        # to crash with IndexError on corpus[0]
        if not corpus or not all(isinstance(s, str) for s in corpus):
            raise ValueError('corpus must be list of strings')
        corpus = '. '.join(corpus)
    corpus = split_into_sentences(corpus)

    # each entry is (original sentence, cleaned sentence)
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    stemmed = [sastrawi(i) for i in cleaned_strings]

    # dispatch table replaces the if/elif chain whose trailing `else`
    # was unreachable after the validation above
    Vectorizer = {
        'tfidf': TfidfVectorizer,
        'bow': CountVectorizer,
        'skip-gram': SkipGramVectorizer,
    }[vectorizer]
    tf_vectorizer = Vectorizer(
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=STOPWORDS,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(stemmed)

    # tfidf exposes idf_; count-based vectorizers fall back to raw
    # term frequency as the importance signal
    if hasattr(tf_vectorizer, 'idf_'):
        indices = np.argsort(tf_vectorizer.idf_)[::-1]
    else:
        indices = np.argsort(np.asarray(tf.sum(axis=0))[0])[::-1]
    features = tf_vectorizer.get_feature_names()
    top_words = [features[i] for i in indices[:important_words]]

    # guard against a degenerate vocabulary (< 2 features) which would
    # otherwise request 0 components from the decomposition
    n_components = max(1, tf.shape[1] // 2)
    vectors = decomposition(n_components).fit_transform(tf)
    similar = cosine_similarity(vectors, vectors)
    # zero out self-similarity / near-duplicates so pagerank does not
    # reward a sentence for matching itself
    similar[similar >= 0.999] = 0
    scores = pagerank(similar, retry)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)),
        reverse=True,
    )
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }