Example #1
    def __call__(self, document, sentences_count, user_dict):
        self._ensure_dependecies_installed()
        self.nlp_doc = self.nlp(document)
        self.user_dict = user_dict
        logger.info("Created doc")

        dictionary = self._create_dictionary()
        # empty document
        if not dictionary:
            return ()
        matrix = self._create_matrix(dictionary)
        matrix = self._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        ranks = iter(self._compute_ranks(sigma, v))
        sents = [s.text for s in self.nlp_doc.sents]
        logger.info("Sentences generated by spacy are %s, count %s", sents, len(sents))
        new_sents = self._get_best_sentences(sents, sentences_count * 2, lambda s: next(ranks))
        filt_sents = [sent for sent in new_sents if self.better_question(sent)]
        additional_sents = set(new_sents) - set(filt_sents)
        to_add = sentences_count - len(filt_sents)
        final_sents = filt_sents
        if to_add > 0:
            final_sents += sorted(additional_sents, key=lambda x: len(x), reverse=True)[:to_add]
        logger.info("Filtered sentences %s", filt_sents)
        logger.info("Final recommendations are %s", final_sents[:sentences_count])
        return final_sents
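
Most of these snippets call singular_value_decomposition(matrix, full_matrices=False); this appears to be numpy.linalg.svd imported under an alias, as sumy's LSA summarizer does. A minimal sketch of the shapes involved, under that assumption:

# Assumption: `singular_value_decomposition` is numpy.linalg.svd under an alias.
import numpy as np
from numpy.linalg import svd as singular_value_decomposition

matrix = np.random.rand(6, 4)  # terms x sentences
u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
# shapes: u (6, 4), sigma (4,), v (4, 4); the factors reconstruct the matrix
assert np.allclose(matrix, u @ np.diag(sigma) @ v)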
def lsa_summary(text, limit=1):
  
  [dictionary, proc_text, sentences] = save_word_dict(text)
  
  tf_matrix = create_tf_matrix(proc_text, dictionary)
  tf_matrix = normalize_tf_matrix(tf_matrix, 0.3)

  # decompose in U x S X V matrices using SVD
  [u, s, v] = singular_value_decomposition(tf_matrix, full_matrices=False)

  reduction_ratio = 1.0
  dimension = len(s)
  reduced_dimension = int(dimension * reduction_ratio)

  min_dimension = 5

  if(reduced_dimension < min_dimension):
    reduced_dimension = min_dimension

  s2 = numpy.array(s, copy=True)
  s2 = numpy.square(s2)

  for i in range(reduced_dimension, dimension):
    s2[i] = 0.0

  # http://textmining.zcu.cz/publications/PhDThesis-Steinberger.pdf
  # see page 25 - Sk = sqrt(sum(v * sigma^2 ))
  ranks = numpy.sqrt(numpy.square(v.T*s2).sum(axis=1))
  ranked_sentences = sorted(range(len(ranks)),key=lambda x:ranks[x], reverse=True)
  
  result_summary = ''
  for i in range(0, limit):
    result_summary = result_summary + ' ' + sentences[ranked_sentences[i]]

  return result_summary
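
The comment above cites Steinberger's thesis, which ranks sentence k as S_k = sqrt(sum_i sigma_i^2 * v_{i,k}^2). A small self-contained sketch of that formula follows; note that it applies the thesis formula literally (squaring v before weighting by sigma^2), whereas the snippet above squares the whole product v.T * sigma^2, and it assumes v is the V^T matrix returned by numpy with one column per sentence:

import numpy as np

def steinberger_ranks(sigma, v, kept_dimensions=None):
    # S_k = sqrt(sum_i sigma_i^2 * v_{i,k}^2), summed over the kept topics
    sigma2 = np.square(np.asarray(sigma, dtype=float))
    if kept_dimensions is not None:
        sigma2[kept_dimensions:] = 0.0  # drop the least significant topics
    # v has shape (topics, sentences): weight each squared loading by sigma^2
    return np.sqrt(np.square(v).T @ sigma2)

np.argsort(-steinberger_ranks(sigma, v)) then lists sentence indices from strongest to weakest.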
Example #3
    def __call__(self, document, sentences_count, user_dict):
        self._ensure_dependecies_installed()
        self.nlp_doc = self.nlp(document)
        self.user_dict = user_dict
        logger.info("Created doc")

        dictionary = self._create_dictionary()
        # empty document
        if not dictionary:
            return ()
        matrix = self._create_matrix(dictionary)
        matrix = self._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        ranks = iter(self._compute_ranks(sigma, v))
        sents = [s.text for s in self.nlp_doc.sents]
        logger.info("Sentences generated by spacy are %s, count %s", sents,
                    len(sents))
        new_sents = self._get_best_sentences(sents, sentences_count * 2,
                                             lambda s: next(ranks))
        filt_sents = [sent for sent in new_sents if self.better_question(sent)]
        additional_sents = set(new_sents) - set(filt_sents)
        to_add = sentences_count - len(filt_sents)
        final_sents = filt_sents
        if to_add > 0:
            final_sents += sorted(additional_sents,
                                  key=lambda x: len(x),
                                  reverse=True)[:to_add]
        logger.info("Filtered sentences %s", filt_sents)
        logger.info("Final recommendations are %s",
                    final_sents[:sentences_count])
        return final_sents
    def __call__(self, document, sentences_count):
        dictionary = self.create_dictionary(document)
        matrix = self.create_matrix(document, dictionary)
        matrix = self.compute_term_frequency(matrix)
        # print("matrix:\n", matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
        ranks = iter(self.compute_ranks(sigma, v))
        # print("ranks:\n", next(ranks))
        return self.get_best_sentences(document.sentences, sentences_count,
                                       lambda s: next(ranks))
    def __call__(self, doc, sent_count):

        dictionary = self._create_dictionary(doc)
        matrix = self._create_matrix(doc, dictionary)
        matrix = self._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        ranks = iter(self._compute_ranks(sigma, v))
        return self._get_best_sentences(doc.sentences, sent_count,
            lambda s: next(ranks))
    def svdBpmMatrix(self,plane): #Computes SVD of the bpm_matrix
        global nturns

        bpm_matrix = self.peak_peak(plane)
        n_bpms = shape(bpm_matrix)[1]
        print('performing SVD')
        #----svd for matrix with bpms >10
        if n_bpms > 10:
            A = singular_value_decomposition(bpm_matrix, full_matrices=0)
        else:
            sys.exit('Exit, # of bpms < 10')
            
        return A
    def svdClean(self, plane):
        global nturns, tx, ty
        print('removing noise floor', plane)
        
        if plane == 'x':
            b = tx[turn:,:]  #truncate by the first 5 turns
            n_turns = shape(b)[0]

        elif plane == 'y':
            b = ty[turn:,:]  #truncate by the first 5 turns
            n_turns = shape(b)[0]
        else:
            print("no tbt data acquired")
        
        b_mean = mean(b)
        b = (b-b_mean)/sqrt(n_turns)
        n_bpms = shape(b)[1]
        #----svd for matrix with bpms >10

        if n_bpms > 10:
            A = singular_value_decomposition(b, full_matrices=0)
            #print("Singular values:", A[1])
        else:
            sys.exit('Exit, # of bpms < 10')

        #----SVD cut for noise floor
        if sing_val > n_bpms:
            svdcut = n_bpms
            print('requested more singular values than available')
            print('# of sing_val used for', plane, '=', n_bpms)
        else:
            svdcut = int(sing_val)
            print('# of sing_val used for', plane, '=', svdcut)
        #print(A[1][0])
        A[1][svdcut:] = 0.
        #temp = matrixmultiply(identity(len(A[1]))*A[1], A[2])
        temp = matrixmultiply(diag(A[1]), A[2])
        b = matrixmultiply(A[0], temp)  ### check
        b = (b * sqrt(n_turns)) + b_mean
        #b = b*sqrt(n_turns)
        
        if plane == 'x':
            tx[turn:,:] = b
        elif plane == 'y':
            ty[turn:,:] = b
        else:
            print("no tbt data to analyze")
        nturns = shape(tx)[0]
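
svdClean zeroes every singular value beyond sing_val and rebuilds the turn-by-turn matrix. The same noise-floor cut in plain modern NumPy might look like the sketch below; the function and argument names are illustrative and not part of the original module:

import numpy as np

def svd_noise_cut(data, n_keep):
    # keep the n_keep largest singular values of `data` and reconstruct
    b_mean = data.mean()
    n_turns = data.shape[0]
    scaled = (data - b_mean) / np.sqrt(n_turns)
    u, s, vt = np.linalg.svd(scaled, full_matrices=False)
    s[n_keep:] = 0.0              # noise-floor cut
    cleaned = (u * s) @ vt        # equivalent to u @ np.diag(s) @ vt
    return cleaned * np.sqrt(n_turns) + b_mean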
Example #8
    def __call__(self, document, sentences_count):
        self._ensure_dependecies_installed()
        dictionary = self._create_dictionary(document)
        if not dictionary:
            return ()

        matrix = self._create_matrix(document, dictionary)

        matrix = self._compute_term_frequency(matrix)
        # print(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        ranks = iter(self._compute_ranks(sigma, v))
        return self._get_best_sentences(document.sentences, sentences_count,
            lambda s: next(ranks))
Example #9
    def __call__(self, document, sentences_count):
        self._ensure_dependecies_installed()

        dictionary = self._create_dictionary(document)
        # empty document
        if not dictionary:
            return ()

        matrix = self._create_matrix(document, dictionary)
        matrix = self._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        ranks = iter(self._compute_ranks(sigma, v))
        return self._get_best_sentences(document.sentences, sentences_count,
            lambda s: next(ranks))
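
The __call__ bodies above appear to come from sumy's LsaSummarizer or close forks of it, and are normally reached through the summarizer's public interface. A usage sketch with stock sumy:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

text = "Neo lives two lives. By day he writes software. By night he is a hacker."
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = LsaSummarizer()
for sentence in summarizer(parser.document, sentences_count=2):
    print(sentence)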
    def __call__(self, document, sentences_count):

        dictionary = self._create_dictionary(document)
        sentences = sent_tokenize(document)

        matrix = self._create_matrix(document, dictionary)
        matrix = self._compute_TfIdf(matrix)

        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        v = self._preprocess_matrix_V(v)
        ranks = iter(self._compute_ranks(v, sigma))

        return self._get_best_sentences(sentences, sentences_count,
            lambda s: next(ranks))
Example #11
    def __call__(self, document, sentences_count):

        dictionary = self._create_dictionary(document)
        
        if not dictionary:
            return ()

        sentences = []
        for i in range(0, len(document)):
            li = sent_tokenize(document[i])
            sentences.extend(li)
        # print(sentences)

        matrix = self._create_matrix(document, dictionary)
        matrix = self._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)

        ranks = iter(self._compute_ranks(sigma, v))
        return self._get_best_sentences(sentences, sentences_count,
            lambda s: next(ranks))
Example #12
def start_lsa(article_id, limit, text, reference_summary):
  if(text == None):
    text = "Thomas A. Anderson is a man living two lives. By day he is an " + \
      "average computer programmer and by night a hacker known as " + \
      "Neo. Neo has always questioned his reality, but the truth is " + \
      "far beyond his imagination. Neo finds himself targeted by the " + \
      "police when he is contacted by Morpheus, a legendary computer " + \
      "hacker branded a terrorist by the government. Morpheus awakens " + \
      "Neo to the real world, a ravaged wasteland where most of " + \
      "humanity have been captured by a race of machines that live " + \
      "off of the humans' body heat and electrochemical energy and " + \
      "who imprison their minds within an artificial reality known as " + \
      "the Matrix. As a rebel against the machines, Neo must return to " + \
      "the Matrix and confront the agents: super-powerful computer " + \
      "programs devoted to snuffing out Neo and the entire human " + \
      "rebellion."

  [dictionary, proc_text, sentences] = save_word_dict(text)
  

  tf_matrix = create_tf_matrix(proc_text, dictionary)
  tf_matrix = normalize_tf_matrix(tf_matrix, 0.3)

  # decompose in U x S X V matrices using SVD
  [u, s, v] = singular_value_decomposition(tf_matrix, full_matrices=False)

  reduction_ratio = 1.0
  dimension = s.shape[0]
  reduced_dimension = int(dimension * reduction_ratio)

  min_dimension = 1

  if(reduced_dimension < min_dimension):
    reduced_dimension = min_dimension

  s2 = numpy.array(s, copy=True)
  s2 = numpy.square(s2)

  if(reduced_dimension < dimension):
    for i in range(reduced_dimension, dimension):
      s2[i] = 0

  # http://textmining.zcu.cz/publications/PhDThesis-Steinberger.pdf
  # see page 25 - Sk = sqrt(sum(v * sigma^2 ))
  ranks = numpy.sqrt(numpy.square(v.T*s2).sum(axis=1))
  #print "ranks " , ranks
  ranked_sentences = sorted(range(len(ranks)),key=lambda x:ranks[x], reverse=True)
  #print "ranked_sentences ", ranked_sentences

  
  result_summary = ''
  for i in range(0, limit):
    result_summary = result_summary + ' ' + sentences[ranked_sentences[i]]

  system_summary = result_summary

  # if(reference_summary != None):
  #   try:
  #     reference_summary = summarize(text)
  #   except (ValueError, ZeroDivisionError):
  #     return -1

  # if(reference_summary == None or len(reference_summary) == 0 or len(reference_summary) > 140):
  #   return -1

  # write reference summary to file
  
  sys_dir = os.pardir + "/test-summarization/system/" + article_id + "_" + "system.txt"
  ref_dir = os.pardir + "/test-summarization/reference/" + article_id + "_" + "reference.txt"

  write_to_file(ref_dir, reference_summary)
  reference_summary_list.append([ref_dir])

  # write system summary to file
  write_to_file(sys_dir, system_summary)
  system_summary_list.append(sys_dir)
  # test_print(reference_summary, system_summary)
  return ranked_sentences
Example #13
def LSAPlus_SumPlus(doc):

    # SumPlus
    sumbasic_sents = []
    for text in doc:
        tsummarizer_w_stops = SumBasicSummarizer()
        tsummarizer_w_stops.stop_words = get_stop_words('english')
        parser = PlaintextParser.from_string(text, Tokenizer('english'))
        dictionary = tsummarizer_w_stops._compute_ratings(
            parser.document.sentences)
        sumbasic_sents_entries = []
        for sent in dictionary:
            sumbasic_sents_entries.append(sent)
        sumbasic_sents.append(sumbasic_sents_entries)

    #LSAPlus

    lsa_sents = []
    for text in doc:
        l2summarizer = LsaSummarizer()
        parser = PlaintextParser.from_string(text, Tokenizer('english'))
        dictionary = (l2summarizer._create_dictionary(parser.document))
        matrix = l2summarizer._create_matrix(parser.document, dictionary)
        matrix2 = l2summarizer._compute_term_frequency(matrix)
        u, sigma, v = singular_value_decomposition(matrix2,
                                                   full_matrices=False)
        # rank sentence indices by the magnitude of their loading on the
        # first right-singular vector (row 0 of V spans the sentences)
        v_indices = np.argsort(-np.abs(v[0, :]))

        sents = np.array(list(parser.document.sentences))
        sents = sents[v_indices]
        lsa_sents_entries = list(sents)
        lsa_sents.append(lsa_sents_entries)

    # Combining SumPlus and LSAPlus
    import math
    num_sentences = len(sumbasic_sents)
    all_sents_removed_parent2 = []
    for entry in range(num_sentences):
        num_sents_to_remove = math.ceil(len(sumbasic_sents[entry]) / 2)
        sent_len = len(sumbasic_sents[entry])
        sb = sumbasic_sents[entry][sent_len - num_sents_to_remove:sent_len]
        lsa = lsa_sents[entry][sent_len - num_sents_to_remove:sent_len]

        # Checking if Sentences are ranked bad by BOTH LSAPlus and SumPlus
        sents_removed3 = []
        for sent in lsa:
            if (sent in sb):
                sents_removed3.append(sent)

        # Sentences to be Trimmed Off
        all_sents_removed_parent2.append(sents_removed3)

    sents_to_keep_parent2 = []
    for i in range(len(doc)):
        parser = PlaintextParser.from_string(doc[i], Tokenizer('english'))
        sents = parser.document.sentences

        # Sentences not Trimmed Off
        sents_to_keep2 = [
            sentence for sentence in sents
            if sentence not in all_sents_removed_parent2[i]
        ]

        # Appending Trimmed Text for Each Entry
        sents_to_keep_parent2.append(sents_to_keep2)

    # Trimmed Text
    sentence_parent2 = []
    for text in sents_to_keep_parent2:
        sentence = ""
        for sent in text:
            sentence = sentence + " " + str(sent)
        sentence_parent2.append(sentence)

    return sentence_parent2
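
A usage sketch for LSAPlus_SumPlus, assuming doc is simply a list of raw text strings (as the loops over doc suggest):

documents = [
    "First article. It has several sentences. Some carry more weight than others.",
    "Second article. LSA and SumBasic each rank its sentences independently.",
]
trimmed_texts = LSAPlus_SumPlus(documents)
for summary in trimmed_texts:
    print(summary)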
Example #14
    def run(self):
        self.signals.UpdateProgressBar.emit(0)

        # Load the text
        self.text = TextData(self.filename)
        self.text.original_sentences = readSentencesFromInputText(self.filename, None)
        original_sentences = tuple(self.text.original_sentences)

        self.signals.UpdateProgressBar.emit(5)

        # Split the text into words
        self.configurations["minimal_words_in_sentence"] = 4
        self.configurations['need_agresive_filtration'] = True
        self.text.tokenized_sentences = tokenizeSingleText(self.text, self.configurations)

        # Remove stop words
        self.configurations["minimal_word_size"] = 3
        self.text.no_stop_words_sentences = removeStopWordsFromSentences(self.text.tokenized_sentences, self.morph, self.configurations)

        if len(self.text.no_stop_words_sentences) > 0:

            np.set_printoptions(suppress=False)

            self.signals.UpdateProgressBar.emit(20)

            # Normalization
            texts, log_string = normalizeTexts([self.text], self.morph)
            self.text = texts[0]

            # Normalize letter case
            texts, log_string = fixRegisterInTexts(texts, self.morph)
            self.text = texts[0]

            self.signals.UpdateProgressBar.emit(30)

            # Compute the word frequency table
            texts, log_string = calculateWordsFrequencyInTexts(texts)
            self.text = texts[0]

            self.signals.UpdateProgressBar.emit(40)

            matrix, all_word_keys = self.CreateLSAMatrixForSummarization(self.text)
            matrix = self._compute_term_frequency(matrix)

            self.signals.UpdateProgressBar.emit(50)

            u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
            u = u + np.abs(np.min(u))
            v = v + np.abs(np.min(v))
            u, sigma, v = self.cutSingularValue(u, sigma, v)

            self.signals.UpdateProgressBar.emit(70)

            if(self.calculation_method == AnnotationMakerCalculator.METHOD_BY_SENTENCE_VALUE):
                self.calculateBySentenceValues(v, self.result_sentence_count)

            if (self.calculation_method == AnnotationMakerCalculator.METHOD_BY_WORDS_SUM):
                self.calculateByWordsValues(all_word_keys, u, self.result_sentence_count)
            self.signals.PrintInfo.emit('\nУспешно завершено.')  # "Finished successfully."
        else:
            self.signals.PrintInfo.emit('\nНедостаточно входных данных и/или много неликвидных данных.')  # "Not enough input data and/or too much unusable data."

        self.signals.UpdateProgressBar.emit(100)

        self.signals.Finished.emit()
Example #15
def start_lsa(article_id, limit, text, reference_summary):
    if (text == None):
        text = "Thomas A. Anderson is a man living two lives. By day he is an " + \
          "average computer programmer and by night a hacker known as " + \
          "Neo. Neo has always questioned his reality, but the truth is " + \
          "far beyond his imagination. Neo finds himself targeted by the " + \
          "police when he is contacted by Morpheus, a legendary computer " + \
          "hacker branded a terrorist by the government. Morpheus awakens " + \
          "Neo to the real world, a ravaged wasteland where most of " + \
          "humanity have been captured by a race of machines that live " + \
          "off of the humans' body heat and electrochemical energy and " + \
          "who imprison their minds within an artificial reality known as " + \
          "the Matrix. As a rebel against the machines, Neo must return to " + \
          "the Matrix and confront the agents: super-powerful computer " + \
          "programs devoted to snuffing out Neo and the entire human " + \
          "rebellion."

    [dictionary, proc_text, sentences] = save_word_dict(text)

    tf_matrix = create_tf_matrix(proc_text, dictionary)
    tf_matrix = normalize_tf_matrix(tf_matrix, 0.3)

    # decompose in U x S X V matrices using SVD
    [u, s, v] = singular_value_decomposition(tf_matrix, full_matrices=False)

    reduction_ratio = 1.0
    dimension = s.shape[0]
    reduced_dimension = int(dimension * reduction_ratio)

    min_dimension = 1

    if (reduced_dimension < min_dimension):
        reduced_dimension = min_dimension

    s2 = numpy.array(s, copy=True)
    s2 = numpy.square(s2)

    if (reduced_dimension < dimension):
        for i in range(reduced_dimension, dimension):
            s2[i] = 0

    # http://textmining.zcu.cz/publications/PhDThesis-Steinberger.pdf
    # see page 25 - Sk = sqrt(sum(v * sigma^2 ))
    ranks = numpy.sqrt(numpy.square(v.T * s2).sum(axis=1))
    #print "ranks " , ranks
    ranked_sentences = sorted(range(len(ranks)),
                              key=lambda x: ranks[x],
                              reverse=True)
    #print "ranked_sentences ", ranked_sentences

    result_summary = ''
    for i in range(0, limit):
        result_summary = result_summary + ' ' + sentences[ranked_sentences[i]]

    system_summary = result_summary

    # if(reference_summary != None):
    #   try:
    #     reference_summary = summarize(text)
    #   except (ValueError, ZeroDivisionError):
    #     return -1

    # if(reference_summary == None or len(reference_summary) == 0 or len(reference_summary) > 140):
    #   return -1

    # write reference summary to file

    sys_dir = os.pardir + "/test-summarization/system/" + article_id + "_" + "system.txt"
    ref_dir = os.pardir + "/test-summarization/reference/" + article_id + "_" + "reference.txt"

    write_to_file(ref_dir, reference_summary)
    reference_summary_list.append([ref_dir])

    # write system summary to file
    write_to_file(sys_dir, system_summary)
    system_summary_list.append(sys_dir)
    # test_print(reference_summary, system_summary)
    return ranked_sentences
Example #16
def lsa_text_extraction(textdoc,
                        smooth=0.4,
                        MIN_DIMENSIONS=3,
                        REDUCTION_RATIO=1 / 1,
                        topn=5):
    """
    reduction_ratio: limits how many singular values are kept (and thus computation cost); 1.0 keeps the full diagonal, 0.4 keeps only 0.4 * the original diagonal size
    smooth: smoothing factor applied during matrix normalization; a small value may cause overfitting and a large value may cause underfitting
    """
    ''' document to sentences '''
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    document = tokenizer.tokenize(textdoc)
    ''' generate term freq matrix '''
    assert 0.0 <= smooth < 1.0
    preprocessed_text = textClean.pipeline(document,
                                           multi_gram=[1],
                                           lower_case=True,
                                           deacc=False,
                                           encoding='utf8',
                                           errors='strict',
                                           stem_lemma='lemma',
                                           tag_drop=[],
                                           nltk_stop=True,
                                           stop_word_list=[],
                                           check_numbers=False,
                                           word_length=2,
                                           remove_consecutives=True)
    dictionary = DocVector.generate_corpus_dict(preprocessed_text,
                                                no_below=2,
                                                no_above=0.5,
                                                keep_n=100000)
    doc_vec = DocVector.create_document_vector(preprocessed_text, dictionary)
    tfmatrix = DocVector.get_vocab_matrix(doc_vec, dictionary)
    matrix_copy = tfmatrix.values.T
    '''
    Computes TF metrics for each sentence (column) in the given matrix and normalizes
    the tf weights of all terms occurring in a document by the maximum tf in that
    document, according to ntf_{t,d} = a + (1-a) \frac{tf_{t,d}}{tf_{max}(d)}.

    The smoothing term a damps the contribution of the second term, which may be viewed
    as a scaling down of tf by the largest tf value in d.
    '''
    max_word_frequencies = np.max(matrix_copy, axis=0)
    rows, cols = matrix_copy.shape
    for row in range(rows):
        for col in range(cols):
            max_word_frequency = max_word_frequencies[col]
            if max_word_frequency != 0:
                frequency = matrix_copy[row, col] / max_word_frequency
                matrix_copy[row, col] = smooth + (1.0 - smooth) * frequency
    ''' get ranks '''
    u, sigma, v_matrix = singular_value_decomposition(matrix_copy,
                                                      full_matrices=False)
    assert len(sigma) == v_matrix.shape[0]
    dimensions = max(MIN_DIMENSIONS, int(len(sigma) * REDUCTION_RATIO))
    powered_sigma = tuple(s**2 if i < dimensions else 0.0
                          for i, s in enumerate(sigma))
    ranks = []
    for column_vector in v_matrix.T:
        rank = sum(s * v**2 for s, v in zip(powered_sigma, column_vector))
        ranks.append(math.sqrt(rank))
    ''' output result '''
    percentile_list = pd.DataFrame({
        'sentence': document,
        'rank': ranks,
    }).sort_values(by='rank', ascending=False)

    output_sentence = [i for i in percentile_list.head(topn)['sentence']]
    return output_sentence
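
A usage sketch for lsa_text_extraction, assuming the textClean and DocVector helpers imported by the surrounding module are available:

article = ("Sentence one introduces the topic. Sentence two adds supporting detail. "
           "Sentence three wanders off topic. Sentence four restates the key point.")
top_sentences = lsa_text_extraction(article, smooth=0.4, topn=2)
print(top_sentences)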
    def computeSVD(self, B):
        return singular_value_decomposition(B, full_matrices=0)