Example #1
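The function below relies on a few project-local modules; the plain imports here are an assumption, since the original file may pull them in differently:

import build_source_file   # assumed project module providing reform_text()
import multsum_preprocess  # assumed project module providing preprocess()
import utility_function    # assumed project module providing tokenize()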
def preprocess_document(documents):
    preprocessed_documents = []
    preprocessed_documents_tokens = []

    for document in documents:
        re_document = build_source_file.reform_text(document)
        re_token, re_document = utility_function.tokenize(re_document,
                                                          rm_stop=True)
        preprocessed_documents.append(re_document)
        preprocessed_documents_tokens.append(re_token)

    preprocessed_documents_tokens = multsum_preprocess.preprocess(
        preprocessed_documents_tokens)

    return preprocessed_documents, preprocessed_documents_tokens
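A minimal usage sketch, assuming preprocess_document is in scope and that each input document is a plain text string (the exact input format depends on build_source_file.reform_text and utility_function.tokenize):

raw_documents = [
    "The first document. It has two short sentences.",
    "The second document is a single sentence.",
]
docs, doc_tokens = preprocess_document(raw_documents)
print(docs[0])        # reformatted text of the first document
print(doc_tokens[0])  # its tokens, with stopwords removed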
Example #2
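This example uses the standard-library re module and the multsum_preprocess module, and it also assumes that helpers such as get_def_sentsims, analyze_sentiment, get_w2v_matrix, select_sentences and get_sentence_index, plus constants like DEFAULT_STOPWORDS and W2V_VECTOR_FILE, are defined elsewhere in the same file; only the two imports below can be stated with confidence:

import re                  # used by summarize_strings for sentence splitting
import multsum_preprocess  # assumed project module providing preprocess()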
def summarize_documents(documents,
                        stopwordsFilename=DEFAULT_STOPWORDS,
                        length=DEFAULT_SUMMARY_LENGTH,
                        unit=UNIT_WORDS,
                        use_tfidf_similarity=True,
                        use_sentiment_similarity=True,
                        use_w2v_similarity=True,
                        w2v_vector_file=W2V_VECTOR_FILE,
                        preloaded_w2v_wordmodel=None,
                        w2v_backend=False,
                        w2v_experiments="",
                        quiet=False,
                        output_numbers=False,
                        use_aggregate_for_clustering=False,
                        anaphora_resolution_simple=False,
                        min_sentence_length=MIN_SENTENCE_LENGTH):

  # print 'summarize_documents()'
  #for l in documents:
  #  for s in l:
  #    print s
  documents = multsum_preprocess.preprocess(documents, anaphora_resolution_simple=anaphora_resolution_simple, quiet=quiet)

  sentsims = get_def_sentsims(documents, stopwordsFilename, None)

  matrices = list()
  flat_sentences = [sentence for document in documents for sentence in document]
  #for sentence in flat_sentences:
  #  print sentence
  if use_sentiment_similarity:
    (pos, neg) = analyze_sentiment(flat_sentences)
    matrices.append(pos)
    matrices.append(neg)
  if use_w2v_similarity:
    if not quiet:
      print('Computing sentence similarities based on word2vec.')
    wordmodel = None
    if preloaded_w2v_wordmodel:
      wordmodel = preloaded_w2v_wordmodel
    elif not w2v_backend:
      wordmodel = load_w2v_wordmodel(w2v_vector_file)
    if wordmodel or w2v_backend:
      w2v_matrix = get_w2v_matrix(flat_sentences, wordmodel, w2v_backend, get_stopwords(stopwordsFilename), documents, w2v_experiments, quiet=quiet)
      if w2v_matrix is not None:
        matrices.append(w2v_matrix)
  if use_tfidf_similarity or len(matrices) == 0:
    # this is also used for fallback if the others were specified and failed for some reason.
    matrices.append(sentsims["tfidf_cosine"])

  if not quiet:
    print('Input sentences:')
    for l in documents:
      for s in l:
        print('  ' + ' '.join(s))

  #for m in matrices:
  #  for i in range(0,m.shape[0]):
  #    for j in range(0,m.shape[1]):
  #      print str(m[i][j])+' ',
  #    print 'EOL'

  summary_set = select_sentences(length,
                     matrices,
                     sentsims["idf_vectors"],
                     documents,
                     unit,
                     None,
                     'summarization_doc',
                     use_aggregate_for_clustering=use_aggregate_for_clustering,
                     clustering_matrix=None)
  summary_list = list(summary_set)
  summary_list.sort()
  return_string = ''
  #if not quiet:
  #  print 'Summary:'
  for i in summary_list:
    if output_numbers:
      #print "outputting numbers: %d"%(i+1)
      return_string += "%d\n"%(i+1)
    else:
      return_string += ' '.join(get_sentence_index(i, documents))+'\n'
    #if not quiet:
    #  print('  '+get_sentence_index(i, documents))
  return return_string
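A hedged usage sketch for summarize_documents, assuming the module-level defaults (DEFAULT_STOPWORDS, W2V_VECTOR_FILE, MIN_SENTENCE_LENGTH, UNIT_WORDS) are defined and that each document is passed as a list of sentence strings; the sentiment and word2vec similarities are switched off so the call falls back to the tfidf matrix only:

docs = [
    ["The cat sat on the mat near the window.",
     "It was a very comfortable and sunny spot."],
    ["Dogs usually prefer the garden outside.",
     "The garden is large, sunny and full of interesting smells."],
]
summary = summarize_documents(docs,
                              length=20,
                              use_sentiment_similarity=False,
                              use_w2v_similarity=False,
                              quiet=True)
print(summary)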
def summarize_strings(sentences_lists,
                      stopwordsFilename=DEFAULT_STOPWORDS,
                      length=DEFAULT_SUMMARY_LENGTH,
                      unit=UNIT_WORDS,
                      use_tfidf_similarity=True,
                      use_sentiment_similarity=True,
                      use_w2v_similarity=True,
                      w2v_vector_file=W2V_VECTOR_FILE,
                      split_sentences=False,
                      preloaded_w2v_wordmodel=None,
                      w2v_backend=False,
                      w2v_experiments="",
                      quiet=False,
                      output_numbers=False,
                      use_aggregate_for_clustering=False,
                      anaphora_resolution_simple=False,
                      min_sentence_length=MIN_SENTENCE_LENGTH):

    # print 'summarize_strings()'
    #for l in sentences_lists:
    #  for s in l:
    #    print s

    sentence_count = 0
    if split_sentences:
        if not quiet:
            print('splitting')
        splittedLists = []
        for l in sentences_lists:
            splittedList = []
            documentstring = ""
            for s in l:
                documentstring += " \n" + s
            #splitted = re.split('[\.!?]', s)
            splitted = re.split(r'(?<=[\.!\?])\W+', documentstring)
            for s in splitted:
                stripped = s.strip()
                if stripped and stripped.count(' ') + 1 > min_sentence_length:
                    sentence_count += 1
                    splittedList.append(s.replace('\n', ' '))
                    #print s
            splittedLists.append(splittedList)
        sentences_lists = splittedLists
    elif min_sentence_length > 0:
        new_sentences_lists = list()
        for l in sentences_lists:
            new_l = list()
            for s in l:
                # keep sentences longer than the minimum, mirroring the split_sentences branch above
                if s.strip().count(' ') + 1 > min_sentence_length:
                    sentence_count += 1
                    new_l.append(s)
            new_sentences_lists.append(new_l)
        sentences_lists = new_sentences_lists

    if not quiet:
        print "Total sentence count after min length filtering and (possibly) splitting: %d" % (
            sentence_count)

    sentences_lists = multsum_preprocess.preprocess(
        sentences_lists, anaphora_resolution_simple=anaphora_resolution_simple)

    sentsims = get_def_sentsims(sentences_lists, stopwordsFilename, None)

    matrices = list()
    flat_sentences = [
        sentence for document in sentences_lists for sentence in document
    ]
    #for sentence in flat_sentences:
    #  print sentence
    if use_sentiment_similarity:
        (pos, neg) = analyze_sentiment(flat_sentences)
        matrices.append(pos)
        matrices.append(neg)
    if use_w2v_similarity:
        if not quiet:
            print('Computing sentence similarities based on word2vec.')
        wordmodel = None
        if preloaded_w2v_wordmodel:
            wordmodel = preloaded_w2v_wordmodel
        elif not w2v_backend:
            wordmodel = load_w2v_wordmodel(w2v_vector_file)
        if wordmodel or w2v_backend:
            w2v_matrix = get_w2v_matrix(flat_sentences,
                                        wordmodel,
                                        w2v_backend,
                                        get_stopwords(stopwordsFilename),
                                        sentences_lists,
                                        w2v_experiments,
                                        quiet=quiet)
            if w2v_matrix is not None:
                matrices.append(w2v_matrix)
    if use_tfidf_similarity or len(matrices) == 0:
        # this is also used for fallback if the others were specified and failed for some reason.
        matrices.append(sentsims["tfidf_cosine"])

    if not quiet:
        print('Input sentences:')
        for l in sentences_lists:
            for s in l:
                print('  ' + s)

    #for m in matrices:
    #  for i in range(0,m.shape[0]):
    #    for j in range(0,m.shape[1]):
    #      print str(m[i][j])+' ',
    #    print 'EOL'

    summary_set = select_sentences(
        length,
        matrices,
        sentsims["idf_vectors"],
        sentences_lists,
        unit,
        None,
        'summarization_doc',
        use_aggregate_for_clustering=use_aggregate_for_clustering)
    summary_list = list(summary_set)
    summary_list.sort()
    return_string = ''
    #if not quiet:
    #  print 'Summary:'
    for i in summary_list:
        if output_numbers:
            #print "outputting numbers: %d"%(i+1)
            return_string += "%d\n" % (i + 1)
        else:
            return_string += get_sentence_index(i, sentences_lists) + '\n'
        #if not quiet:
        #  print('  '+get_sentence_index(i, sentences_lists))
    return return_string
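Similarly, a sketch of calling summarize_strings on raw text, letting it split sentences itself and asking for 1-based sentence numbers instead of the sentence text; again, the heavier similarity measures are disabled and the helper constants are assumed to exist:

texts = [
    ["First article. It introduces one topic in a handful of sentences. Here is a closing sentence."],
    ["Second article. It covers a closely related topic. A short remark ends it."],
]
numbers = summarize_strings(texts,
                            length=15,
                            split_sentences=True,
                            use_sentiment_similarity=False,
                            use_w2v_similarity=False,
                            output_numbers=True,
                            quiet=True)
print(numbers)  # one selected sentence number per line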