def __init__(self):
    """Load the text-processing pipeline components and static resources."""
    # Spelling corrector, also shared with the normalizer below.
    self.corrector = spell.probability()
    self.preprocessing = preprocessing.preprocessing()
    self.normalizing = normalize.normalizer(self.corrector)
    self.stemming = stem.sastrawi()
    # Fix: the original used open(...).read() and never closed the handle;
    # a `with` block guarantees the file is closed.
    with open('modified stopword list.txt', 'r') as fopen:
        self.stopword = fopen.read().split('\n')
    # Translation table mapping Unicode superscript digits to ASCII digits.
    self.SUP = str.maketrans("⁰¹²³⁴⁵⁶⁷⁸⁹", "0123456789")
    self.punctuation = string.punctuation
# Esempio n. 2 (pagination artifact from the code-example scraper; kept as a comment so the file parses)
def _base_summarizer(
    corpus,
    decomposition,
    top_k: int = 3,
    max_df: float = 0.95,
    min_df: int = 2,
    ngram: Tuple[int, int] = (1, 3),
    vectorizer: str = 'bow',
    important_words: int = 10,
    retry: int = 5,
    **kwargs,
):
    """Extractive summarization via matrix decomposition + TextRank.

    Parameters
    ----------
    corpus : str or list of str
        Text to summarize. A string is split into sentences; a list is
        joined with '. ' first and then split.
    decomposition : callable
        Factory taking ``n_components`` and returning an object with
        ``fit_transform`` (e.g. TruncatedSVD / NMF).
    top_k : int
        Number of sentences in the returned summary.
    max_df, min_df, ngram : vectorizer vocabulary controls.
    vectorizer : {'tfidf', 'bow', 'skip-gram'}
        Which vectorizer class to use.
    important_words : int
        Number of top-ranked terms to report.
    retry : int
        Passed to ``pagerank`` as its retry budget.
    **kwargs
        Extra keyword arguments forwarded to the vectorizer.

    Returns
    -------
    dict with keys 'summary', 'top-words', 'cluster-top-words'.

    Raises
    ------
    ValueError on invalid ``vectorizer``, ``min_df``, ``max_df`` or ``corpus``.
    """
    vectorizer = vectorizer.lower()
    # Dispatch table also serves as the validation whitelist, replacing the
    # original duplicated if/elif chain with an unreachable else branch.
    vectorizer_classes = {
        'tfidf': TfidfVectorizer,
        'bow': CountVectorizer,
        'skip-gram': SkipGramVectorizer,
    }
    if vectorizer not in vectorizer_classes:
        raise ValueError(
            "vectorizer must be in  ['tfidf', 'bow', 'skip-gram']")

    if min_df < 1:
        raise ValueError('min_df must be bigger than 0')
    if not (max_df <= 1 and max_df > 0):
        raise ValueError(
            'max_df must be bigger than 0, less than or equal to 1')
    if not isinstance(corpus, (list, str)):
        # Fix: the original message claimed only a list was accepted.
        raise ValueError('corpus must be a string or a list of strings')
    if isinstance(corpus, list):
        # Fix: an empty list previously raised a bare IndexError at corpus[0].
        if not corpus:
            raise ValueError('corpus must not be empty')
        if not isinstance(corpus[0], str):
            raise ValueError('corpus must be list of strings')
        corpus = '. '.join(corpus)
    corpus = split_into_sentences(corpus)

    # Keep (original, cleaned) sentence pairs aligned by index.
    splitted_fullstop = [summary_textcleaning(i) for i in corpus]
    original_strings = [i[0] for i in splitted_fullstop]
    cleaned_strings = [i[1] for i in splitted_fullstop]
    stemmed = [sastrawi(i) for i in cleaned_strings]

    tf_vectorizer = vectorizer_classes[vectorizer](
        max_df=max_df,
        min_df=min_df,
        ngram_range=ngram,
        stop_words=STOPWORDS,
        **kwargs,
    )
    tf = tf_vectorizer.fit_transform(stemmed)
    # tfidf exposes idf_; otherwise rank terms by raw frequency.
    if hasattr(tf_vectorizer, 'idf_'):
        indices = np.argsort(tf_vectorizer.idf_)[::-1]
    else:
        indices = np.argsort(np.asarray(tf.sum(axis=0))[0])[::-1]

    # Fix: get_feature_names() was removed in scikit-learn >= 1.2;
    # prefer get_feature_names_out() and fall back for older versions.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        features = list(tf_vectorizer.get_feature_names_out())
    else:
        features = tf_vectorizer.get_feature_names()
    top_words = [features[i] for i in indices[:important_words]]

    # Fix: clamp n_components to >= 1 so a single-term vocabulary does not
    # request a 0-component decomposition.
    n_components = max(1, tf.shape[1] // 2)
    vectors = decomposition(n_components).fit_transform(tf)
    similar = cosine_similarity(vectors, vectors)
    # Zero out self-similarity (and near-duplicates) before PageRank.
    similar[similar >= 0.999] = 0
    scores = pagerank(similar, retry)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_strings)), reverse=True)
    summary = [r[1] for r in ranked_sentences[:top_k]]
    return {
        'summary': ' '.join(summary),
        'top-words': top_words,
        'cluster-top-words': cluster_words(top_words),
    }