def preprocess_document(documents):
    preprocessed_documents = []
    preprocessed_documents_tokens = []
    for document in documents:
        re_document = build_source_file.reform_text(document)
        re_token, re_document = utility_function.tokenize(re_document, rm_stop=True)
        preprocessed_documents.append(re_document)
        preprocessed_documents_tokens.append(re_token)
    preprocessed_documents_tokens = multsum_preprocess.preprocess(preprocessed_documents_tokens)
    return preprocessed_documents, preprocessed_documents_tokens
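# Usage sketch for preprocess_document (illustrative only; it assumes the
# build_source_file, utility_function, and multsum_preprocess helpers used
# above are importable and behave as called):
#
#   raw_documents = ["First document. It has two sentences.",
#                    "Second document in the collection."]
#   cleaned_docs, token_lists = preprocess_document(raw_documents)
#   # cleaned_docs: the reformed document texts, one entry per input document.
#   # token_lists:  stopword-filtered token lists, run through
#   #               multsum_preprocess.preprocess(), aligned with cleaned_docs.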
def summarize_documents(documents,
                        stopwordsFilename=DEFAULT_STOPWORDS,
                        length=DEFAULT_SUMMARY_LENGTH,
                        unit=UNIT_WORDS,
                        use_tfidf_similarity=True,
                        use_sentiment_similarity=True,
                        use_w2v_similarity=True,
                        w2v_vector_file=W2V_VECTOR_FILE,
                        preloaded_w2v_wordmodel=None,
                        w2v_backend=False,
                        w2v_experiments="",
                        quiet=False,
                        output_numbers=False,
                        use_aggregate_for_clustering=False,
                        anaphora_resolution_simple=False,
                        min_sentence_length=MIN_SENTENCE_LENGTH):
    documents = multsum_preprocess.preprocess(
        documents,
        anaphora_resolution_simple=anaphora_resolution_simple,
        quiet=quiet)
    sentsims = get_def_sentsims(documents, stopwordsFilename, None)

    # One similarity matrix is collected per enabled similarity measure.
    matrices = []
    flat_sentences = [sentence for document in documents for sentence in document]
    if use_sentiment_similarity:
        (pos, neg) = analyze_sentiment(flat_sentences)
        matrices.append(pos)
        matrices.append(neg)
    if use_w2v_similarity:
        if not quiet:
            print('Computing sentence similarities based on word2vec.')
        wordmodel = None
        if preloaded_w2v_wordmodel:
            wordmodel = preloaded_w2v_wordmodel
        elif not w2v_backend:
            wordmodel = load_w2v_wordmodel(w2v_vector_file)
        if wordmodel or w2v_backend:
            w2v_matrix = get_w2v_matrix(flat_sentences, wordmodel, w2v_backend,
                                        get_stopwords(stopwordsFilename),
                                        documents, w2v_experiments, quiet=quiet)
            if w2v_matrix is not None:
                matrices.append(w2v_matrix)
    if use_tfidf_similarity or len(matrices) == 0:
        # TF-IDF similarity is also the fallback if the measures requested
        # above failed to produce a matrix.
        matrices.append(sentsims["tfidf_cosine"])

    if not quiet:
        print('Input sentences:')
        for l in documents:
            for s in l:
                print(' ' + ' '.join(s))

    summary_set = select_sentences(length, matrices, sentsims["idf_vectors"],
                                   documents, unit, None, 'summarization_doc',
                                   use_aggregate_for_clustering=use_aggregate_for_clustering,
                                   clustering_matrix=None)
    summary_list = list(summary_set)
    summary_list.sort()
    return_string = ''
    for i in summary_list:
        if output_numbers:
            # Emit 1-based sentence numbers instead of the sentence text.
            return_string += "%d\n" % (i + 1)
        else:
            return_string += ' '.join(get_sentence_index(i, documents)) + '\n'
    return return_string
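# Usage sketch for summarize_documents (illustrative; note that this function
# expects pre-tokenized input -- each document is a list of sentences and each
# sentence a list of token strings, which is why sentences are printed and
# returned via ' '.join(s) above):
#
#   docs = [[['the', 'first', 'sentence', '.'],
#            ['another', 'sentence', 'here', '.']],
#           [['a', 'second', 'document', '.']]]
#   summary = summarize_documents(docs,
#                                 use_sentiment_similarity=False,
#                                 use_w2v_similarity=False,  # fall back to TF-IDF
#                                 quiet=True)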
def summarize_strings(sentences_lists,
                      stopwordsFilename=DEFAULT_STOPWORDS,
                      length=DEFAULT_SUMMARY_LENGTH,
                      unit=UNIT_WORDS,
                      use_tfidf_similarity=True,
                      use_sentiment_similarity=True,
                      use_w2v_similarity=True,
                      w2v_vector_file=W2V_VECTOR_FILE,
                      split_sentences=False,
                      preloaded_w2v_wordmodel=None,
                      w2v_backend=False,
                      w2v_experiments="",
                      quiet=False,
                      output_numbers=False,
                      use_aggregate_for_clustering=False,
                      anaphora_resolution_simple=False,
                      min_sentence_length=MIN_SENTENCE_LENGTH):
    sentence_count = 0
    if split_sentences:
        if not quiet:
            print('Splitting input into sentences.')
        split_lists = []
        for l in sentences_lists:
            split_list = []
            documentstring = ""
            for s in l:
                documentstring += " \n" + s
            # Split on sentence-final punctuation followed by whitespace.
            parts = re.split(r'(?<=[.!?])\W+', documentstring)
            for part in parts:
                stripped = part.strip()
                if stripped and stripped.count(' ') + 1 > min_sentence_length:
                    sentence_count += 1
                    split_list.append(part.replace('\n', ' '))
            split_lists.append(split_list)
        sentences_lists = split_lists
    elif min_sentence_length > 0:
        new_sentences_lists = []
        for l in sentences_lists:
            new_l = []
            for s in l:
                # Keep only sentences longer than the minimum, mirroring the
                # length filter applied in the splitting branch above.
                if s.strip().count(' ') + 1 > min_sentence_length:
                    sentence_count += 1
                    new_l.append(s)
            new_sentences_lists.append(new_l)
        sentences_lists = new_sentences_lists
    if not quiet:
        print("Total sentence count after min length filtering and (possibly) splitting: %d" % sentence_count)

    sentences_lists = multsum_preprocess.preprocess(
        sentences_lists, anaphora_resolution_simple=anaphora_resolution_simple)
    sentsims = get_def_sentsims(sentences_lists, stopwordsFilename, None)

    # One similarity matrix is collected per enabled similarity measure.
    matrices = []
    flat_sentences = [sentence for document in sentences_lists for sentence in document]
    if use_sentiment_similarity:
        (pos, neg) = analyze_sentiment(flat_sentences)
        matrices.append(pos)
        matrices.append(neg)
    if use_w2v_similarity:
        if not quiet:
            print('Computing sentence similarities based on word2vec.')
        wordmodel = None
        if preloaded_w2v_wordmodel:
            wordmodel = preloaded_w2v_wordmodel
        elif not w2v_backend:
            wordmodel = load_w2v_wordmodel(w2v_vector_file)
        if wordmodel or w2v_backend:
            w2v_matrix = get_w2v_matrix(flat_sentences, wordmodel, w2v_backend,
                                        get_stopwords(stopwordsFilename),
                                        sentences_lists, w2v_experiments, quiet=quiet)
            if w2v_matrix is not None:
                matrices.append(w2v_matrix)
    if use_tfidf_similarity or len(matrices) == 0:
        # TF-IDF similarity is also the fallback if the measures requested
        # above failed to produce a matrix.
        matrices.append(sentsims["tfidf_cosine"])

    if not quiet:
        print('Input sentences:')
        for l in sentences_lists:
            for s in l:
                print(' ' + s)

    summary_set = select_sentences(length, matrices, sentsims["idf_vectors"],
                                   sentences_lists, unit, None, 'summarization_doc',
                                   use_aggregate_for_clustering=use_aggregate_for_clustering)
    summary_list = list(summary_set)
    summary_list.sort()
    return_string = ''
    for i in summary_list:
        if output_numbers:
            # Emit 1-based sentence numbers instead of the sentence text.
            return_string += "%d\n" % (i + 1)
        else:
            return_string += get_sentence_index(i, sentences_lists) + '\n'
    return return_string
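# A minimal end-to-end sketch for summarize_strings (assumptions: the
# DEFAULT_STOPWORDS file exists locally, and the sentiment and word2vec
# channels are switched off so no external models are needed, leaving
# TF-IDF cosine similarity as the only measure):
#
#   summary = summarize_strings(
#       [["The quick brown fox jumps over the lazy dog. "
#         "Foxes are members of the dog family. "
#         "This sentence is about something else entirely."]],
#       length=20,
#       use_sentiment_similarity=False,
#       use_w2v_similarity=False,
#       split_sentences=True,
#       quiet=True)
#   print(summary)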