from sumy.models.dom import Paragraph, ObjectDocumentModel


def build_document_from_string(string):
    sentences = []
    paragraphs = []

    for line in string.strip().splitlines():
        line = line.lstrip()
        if line.startswith("# "):
            # lines prefixed with "# " become heading sentences
            sentences.append(build_sentence(line[2:], is_heading=True))
        elif not line:
            # a blank line closes the current paragraph
            paragraphs.append(Paragraph(sentences))
            sentences = []
        else:
            sentences.append(build_sentence(line))

    paragraphs.append(Paragraph(sentences))

    return ObjectDocumentModel(paragraphs)
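# Usage sketch for build_document_from_string, assuming build_sentence is the
# sibling helper that wraps sumy's Sentence with a shared Tokenizer. Lines
# starting with "# " become headings; blank lines separate paragraphs. The
# sample text below is illustrative only.
document = build_document_from_string("""
    # First heading
    First sentence of the first paragraph.
    Second sentence of the first paragraph.

    # Second heading
    A sentence in the second paragraph.
""")
assert len(document.paragraphs) == 2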
def document(self):
    # annotation tags that may appear on extracted text nodes:
    # a abbr acronym b big blink blockquote cite code
    # dd del dfn dir dl dt em h h1 h2 h3 h4
    # h5 h6 i ins kbd li marquee menu ol pre q
    # s samp strike strong sub sup tt u ul var
    headers = 'h1', 'h2', 'h3'
    annotated_text = self._article.main_text

    paragraphs = []
    for paragraph in annotated_text:
        sentences, current_text = [], ''

        for text, annotations in paragraph:
            if annotations and any(h_tag in annotations for h_tag in headers):
                sentences.append(
                    Sentence(text, self._tokenizer, is_heading=True))
            # skip <pre> nodes
            elif not (annotations and 'pre' in annotations):
                current_text += ' ' + text

        new_sentences = self.tokenize_sentences(current_text)
        sentences.extend(
            Sentence(s, self._tokenizer) for s in new_sentences)
        paragraphs.append(Paragraph(sentences))

    return ObjectDocumentModel(paragraphs)
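# The method above reads like the document property of sumy's HtmlParser
# (self._article comes from breadability, self._tokenizer from sumy). A
# minimal sketch of reaching it through the public parser API, assuming
# breadability is installed; the HTML and URL below are placeholders.
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.html import HtmlParser

html = "<html><body><h1>Title</h1><p>Some paragraph text.</p></body></html>"
parser = HtmlParser.from_string(html, "http://example.com/", Tokenizer("english"))
document = parser.document  # ObjectDocumentModel built by the method above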
import json

import numpy
import scipy.io

import lex_rank_modified  # project-local, modified LexRank summarizer
from sumy.models.dom import Paragraph, ObjectDocumentModel


def calc_quality(filename):
    with open(filename) as in_file:
        my_data = json.load(in_file)
    with open("stopwords.txt") as s_file:
        my_stopword_list = s_file.read().split()

    summarizer = lex_rank_modified.LexRankSummarizer()
    summarizer.stop_words = my_stopword_list

    matrix_dict = {}
    n_docs = 0
    for doc in my_data:
        # wrap each raw response string in a sumy Sentence (to_sentence is a
        # project-local helper) and put them all into a single paragraph
        processed_sents = list(map(to_sentence, doc['responses']))
        my_paragraphs = [Paragraph(processed_sents)]
        my_doc = ObjectDocumentModel(my_paragraphs)

        # score the sentences and store exp-transformed ratings per document
        ratings = summarizer(my_doc)
        feat_matrix = numpy.matrix(ratings).transpose()
        qual_matrix = numpy.exp(feat_matrix)
        matrix_dict['q' + str(n_docs)] = qual_matrix
        n_docs += 1

    scipy.io.savemat('qual.mat', matrix_dict)
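# Hypothetical to_sentence helper assumed by calc_quality above, plus the JSON
# shape the function expects. Neither the helper body nor the tokenizer
# language is part of the original project; both are illustrative assumptions.
from sumy.models.dom import Sentence
from sumy.nlp.tokenizers import Tokenizer

_TOKENIZER = Tokenizer("english")


def to_sentence(text):
    # one raw response string -> one sumy Sentence
    return Sentence(text, _TOKENIZER)

# calc_quality expects JSON of the form:
# [{"responses": ["first answer ...", "second answer ..."]}, ...]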
def create_sumy_dom(text, tokenizer):
    """Creates a sumy-style document from the sentences.

    TODO: Assumes that paragraphs are specified by lines starting with a space.
    """
    from sumy.models.dom import Sentence, Paragraph, ObjectDocumentModel

    paragraphs = []
    paragraph = []
    for ii, line in enumerate(text):
        if line[0] != ' ' and ii > 0:
            # previous line was the last one in its paragraph
            paragraphs.append(Paragraph(paragraph))  # dump finished paragraph
            paragraph = []  # start a new paragraph going forward

        # process the current line
        paragraph.append(Sentence(line, tokenizer))
        if ii + 1 == len(text):  # last line: dump the open paragraph
            paragraphs.append(Paragraph(paragraph))

    return ObjectDocumentModel(tuple(paragraphs))
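# Usage sketch for create_sumy_dom, assuming `text` is a list of sentence
# strings in which continuation lines of a paragraph start with a space and
# lines that open a new paragraph do not. The sample lines are illustrative.
from sumy.nlp.tokenizers import Tokenizer

lines = [
    "First sentence of the first paragraph.",
    " Second sentence of the first paragraph.",
    "First sentence of the second paragraph.",
]
dom = create_sumy_dom(lines, Tokenizer("english"))
assert len(dom.paragraphs) == 2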
def build_document(*sets_of_sentences):
    paragraphs = []
    for sentences in sets_of_sentences:
        sentence_instances = []
        for sentence_as_string in sentences:
            sentence = build_sentence(sentence_as_string)
            sentence_instances.append(sentence)

        paragraphs.append(Paragraph(sentence_instances))

    return ObjectDocumentModel(paragraphs)
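# Minimal end-to-end sketch: build a two-paragraph document with
# build_document and summarize it with sumy's LexRankSummarizer. Assumes
# build_sentence wraps sumy's Sentence with a shared Tokenizer, as in the
# helpers above; the sentences are placeholders.
from sumy.summarizers.lex_rank import LexRankSummarizer

document = build_document(
    ("First sentence.", "Second sentence."),
    ("Third sentence in another paragraph.",),
)
summarizer = LexRankSummarizer()
for sentence in summarizer(document, 2):  # top two sentences
    print(sentence)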
import pytest


def test_only_instances_of_sentence_allowed():
    # fixture text is Czech sample sentences; the "# " line becomes a heading
    document = build_document_from_string("""
        Nějaký muž šel kolem naší zahrady
        Nějaký jiný muž šel kolem vaší zahrady

        # Nová myšlenka
        Už už abych taky šel
    """)

    with pytest.raises(TypeError):
        Paragraph(list(document.sentences) + ["Last sentence"])