def parse_line(self):
    line_number = 1
    qline_begin = self.appconfig.get_value('question_lines', 'start')
    qline_end = self.appconfig.get_value('question_lines', 'end')
    possible_answer_line = self.appconfig.get_value('possible_answers_line')
    for line in self.readlines():
        if line_number == self.appconfig.get_value('paragraph_line'):
            para = Paragraph(line, self.appconfig)
            para.parse()
            yield ("paragraph", para)
        elif qline_begin <= line_number <= qline_end:
            # str.replace returns a new string; assign it so the '?' is actually stripped.
            line = line.replace('?', '')
            qa = QuestionAnswer(line)
            yield ("question_answer", qa)
        elif line_number == possible_answer_line:
            pa = PossibleAnswers(line, self.appconfig)
            pa.parse()
            yield ("possible_answers", pa)
        else:
            raise ValueError('Invalid input, number of lines exceeded')
        line_number += 1
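A minimal sketch of how the tagged tuples yielded above might be consumed. The `reader` object is a hypothetical stand-in for whatever instance exposes `parse_line()` and `readlines()`; only the tag strings come from the original code.

for kind, item in reader.parse_line():   # `reader` is an assumption for illustration
    if kind == "paragraph":
        print("paragraph:", item)
    elif kind == "question_answer":
        print("Q/A:", item)
    elif kind == "possible_answers":
        print("choices:", item)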
def test_init(self):
    sentence = '/>あいうえお'
    p = Paragraph(sentence)
    self.assertEqual(p.sentences, ['あいうえお'])

    sentence = 'あいうえお<a href="somewhere">かき</a>くけこ'
    p = Paragraph(sentence)
    self.assertEqual(p.sentences, ['あいうえおかきくけこ'])
def generate_novel():
    characters = [Person(), Person(), Person()]
    scene_settings = []
    for i in range(5):
        scene_settings.append(generate_setting())
    for setting in scene_settings:
        paragraph = Paragraph(characters, setting)
        paragraph.generate_sentences()
        print(paragraph)
def make_paragraphs(raw_data_path, decorated_data_path, urn, aux_info):
    text, info = "", ""  # from urn
    aux = decorate(info, aux_info)
    paragraphs = text.split('\n')
    for p in paragraphs:
        pg = PG(p, info)
        pg.set_aux_info(aux)
        pg.save_to_file(decorated_data_path)
def save_section_features_to_db(self, paper_ids, list_authors, list_authors_id_200):
    con = psycopg2.connect("dbname='%s' user='%s' host='/tmp/'" %
                           (self.db_name.lower(), getpass.getuser()))
    cur = con.cursor()
    index = {}
    for m in list_authors_id_200:
        index[m] = 0
    num_section = 1
    chunk_id = 1
    for i in range(0, self.num_paper):
        tokens_sum = []
        for j in range(0, len(list_authors[i])):
            novel_id = paper_ids[i]
            index[list_authors[i][j]] += 1
            raw_novel_text = self.get_raw_text(novel_id)
            tokens = nltk.word_tokenize(raw_novel_text.decode('utf-8'))
            # Integer division so the slice bound is an int under Python 3.
            tokens_sum += tokens[0:self.token_size // len(list_authors[i])]
            cur.execute("INSERT INTO section VALUES (%s, %s, %s, %s, %s)",
                        [i + 1, num_section, raw_novel_text, novel_id, list_authors[i][j]])
            num_section += 1
        paragraphs = self.get_paragraphs(tokens_sum)
        for x in range(0, len(paragraphs)):
            para = Paragraph("paper_id", para=paragraphs[x])
            stylo_list = []
            try:
                stylo_list = para.get_stylo_list()
            except Exception:
                print('error')
                raise
            for y in range(0, 57):
                feature_id = y + 1
                try:
                    value = 0 if math.isnan(stylo_list[y]) else stylo_list[y]
                except (IndexError, TypeError):
                    value = 0
                # Parameterized query instead of string interpolation.
                cur.execute("INSERT INTO features VALUES (%s, %s, %s, %s)",
                            [i + 1, chunk_id, feature_id, value])
            chunk_id += 1
        con.commit()
        print("saved section no %s" % (i + 1))
    cur.close()
    con.close()
def draw(self, page, context):
    if not self.reportElement.is_printable(context):
        return
    P = Paragraph(self.text.data, self.textElement.get_style())
    w, h = P.wrap(self.reportElement['width'], self.reportElement['height'])
    x, y = page.translate(self.reportElement['x'], self.reportElement['y'])
    P.drawOn(page.canvas, x, y)
def test_split_by_dots(self):
    p = Paragraph('a')
    html_items = p.split_by_dots('abc')
    self.assertEqual(html_items, ['abc'])

    html_items2 = p.split_by_dots('a!bc')
    self.assertEqual(html_items2, ['a', 'bc'])

    html_items3 = p.split_by_dots('a。bc')
    self.assertEqual(html_items3, ['a', 'bc'])
def test_contains_nomenclature(self):
    self.pp.append_ahead(Line('hamster'))
    self.pp.append_ahead(
        Line('≡ Polyporus mori (Pollini) Fr., Systema Mycologicum 1:'))
    self.pp.append_ahead(Line('344 (1821)'))
    self.pp.close()
    self.assertTrue(self.pp.contains_nomenclature())

    pp2 = Paragraph()
    pp2.append(Line('Araneosa columellata Long, Mycologia 33 (1941) 353.'))
    self.assertTrue(pp2.contains_nomenclature())
def setUp(self):
    self.pp = Paragraph()
    self.pp.append_ahead(
        Line('Julella sublactea (Nylander) R.C. Harris in Egan, Bryologist 90: 163. 1987;\n'))
    self.pp.append_ahead(
        Line('Verrucaria sublactea Nylander, Flora 69: 464. 1886. syn. nov.\n'))
    self.pp.close()
def load_docs(docs_file_path):
    docs = []
    DOMTree = minidom.parse(docs_file_path)
    collection = DOMTree.documentElement
    docElements = collection.getElementsByTagName("doc")
    for docEl in docElements:
        paragraphs = []
        paragraphElements = docEl.getElementsByTagName("p")
        for pEl in paragraphElements:
            concepts = {}
            conceptElements = pEl.getElementsByTagName("concept")
            for cEl in conceptElements:
                name = cEl.getAttribute("name")
                freq = int(cEl.getAttribute("freq"))
                concepts[name] = freq
            p = Paragraph(concepts)
            paragraphs.append(p)
        doc = Document(paragraphs)
        docs.append(doc)
    return docs
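A sketch of the XML layout that load_docs appears to expect, inferred only from the tag and attribute names it reads (doc, p, concept, name, freq). The root element name, file name, and values are illustrative assumptions.

SAMPLE = """<docs>
  <doc>
    <p>
      <concept name="paragraph" freq="3"/>
      <concept name="summary" freq="1"/>
    </p>
  </doc>
</docs>"""

# Hypothetical round trip: one Document containing one Paragraph whose
# concepts dict maps 'paragraph' -> 3 and 'summary' -> 1.
with open("sample_docs.xml", "w") as fh:
    fh.write(SAMPLE)
docs = load_docs("sample_docs.xml")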
def draw(self, page, context):
    if not self.reportElement.is_printable(context):
        return
    value = self.resolve_expression(self.fieldExpression.data, context)
    if value is None and self['isBlankWhenNull']:
        value = ''
    else:
        value = self.format(value)
    P = Paragraph(value, self.textElement.get_style())
    w, h = P.wrap(self.reportElement['width'], self.reportElement['height'])
    x, y = page.translate(self.reportElement['x'], self.reportElement['y'])
    P.drawOn(page.canvas, x, y)
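Both draw methods above appear to follow ReportLab's Paragraph flowable protocol: build a Paragraph, wrap() it into the element's box, then drawOn() a canvas. Assuming that is the Paragraph class in use, a minimal standalone sketch of the same pattern, outside the report-element wrapper classes:

from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfgen import canvas
from reportlab.platypus import Paragraph

c = canvas.Canvas("out.pdf", pagesize=A4)
style = getSampleStyleSheet()["Normal"]

p = Paragraph("Hello, <b>world</b>", style)
w, h = p.wrap(200, 100)   # available width/height, like reportElement['width'/'height']
p.drawOn(c, 72, 700)      # x, y in points, like the translated coordinates above
c.save()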
def summarize(self, content):
    summarized_content_array = []
    for paragraph in content:
        index = paragraph.index
        title = paragraph.title
        # Use a local name rather than rebinding the `content` parameter mid-iteration.
        summarized_text = self.summarize_paragraph(paragraph)
        p = Paragraph(index, title, summarized_text)
        summarized_content_array.append(p)
    return summarized_content_array
def write_paragraph(self, section_number, content):
    """Append a Paragraph for the given section, using the section title
    (or "Summary" for section 0) and the supplied content."""
    if section_number == 0:
        title = "Summary"
    else:
        title = self.get_section(section_number - 1)
    paragraph = Paragraph(section_number, title, content)
    self.paragraphs.append(paragraph)
def document(self):
    current_paragraph = []
    paragraphs = []
    for line in self._text.splitlines():
        line = line.strip()
        if line.isupper():
            heading = Sentence(line, self._tokenizer, is_heading=True)
            current_paragraph.append(heading)
        elif not line and current_paragraph:
            sentences = self._to_sentences(current_paragraph)
            paragraphs.append(Paragraph(sentences))
            current_paragraph = []
        elif line:
            current_paragraph.append(line)
    sentences = self._to_sentences(current_paragraph)
    paragraphs.append(Paragraph(sentences))
    return ObjectDocumentModel(paragraphs)
def summarize(self, content):
    summarized_content_array = []
    for paragraph in content:
        summarized_content = self.get_first_sentance(paragraph.content)
        title = paragraph.title
        index = paragraph.index
        summarized_paragraph = Paragraph(index, title, summarized_content)
        summarized_content_array.append(summarized_paragraph)
    return summarized_content_array
def read_raw_text(self, raw_text_path):
    with open(raw_text_path) as f:
        characters = f.read()
    begin = 8  # each article begins with ".START\n\n"
    pid = 0
    while True:
        pc = characters.find('\n\n', begin)
        if pc == -1:
            break
        self.paragraphs.append(Paragraph(begin, pc, pid))
        pid += 1
        begin = pc + 2  # skip the '\n\n'
    for sen in self.sentences:
        flag = False
        for para in self.paragraphs:
            if sen.begin_offset >= para.begin_offset and sen.end_offset <= para.end_offset:
                para.sentences.append(sen)
                flag = True
                break
        if not flag:
            print('sentence out of paragraph', file=logs)
def parse_annotated(contents: Iterable[Line]) -> Iterator[Paragraph]:
    """Return paragraphs in annotated block form.

    Do not apply heuristic methods to divide paragraphs."""
    pp = Paragraph()
    for line in contents:
        pp.append_ahead(line)
        if line.contains_start():
            (retval, pp) = pp.next_paragraph()
            yield retval
            continue
        if pp.last_line and pp.last_line.end_label() is not None:
            (retval, pp) = pp.next_paragraph()
            yield retval
            continue
def retrieve_forbes_article_data(article, company_list, company_name_automaton):
    article.paragraphs = []
    browser.get(article.url)
    try:
        article_text_section = browser.find_element_by_xpath(
            '//*[@id="article-container-0"]/div[2]/div[2]/article-body-container/div/div')
        article_paragraph_tags = article_text_section.find_elements_by_tag_name('p')
    except Exception:
        return
    all_tickers = []
    for paragraph in article_paragraph_tags:
        if '[+]' in paragraph.text:
            continue
        tickers = []
        for end_index, idx in company_name_automaton.iter(paragraph.text.upper()):
            company_formatted = company_list[idx]
            # Strip the surrounding delimiter characters.
            company_formatted = company_formatted[1:-1]
            tickers.append(company_formatted)
        all_tickers.extend(tickers)
        article.paragraphs.append(Paragraph(paragraph.text, tickers))
        article.webpage_text = article.webpage_text + paragraph.text
    article.tickers = all_tickers
    # Deduplicate while preserving order.
    article.tickers = list(dict.fromkeys(article.tickers))
    return article
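The company_name_automaton.iter() call above matches the pyahocorasick API, where iter() yields (end_index, value) pairs. Assuming that is the library in use, a sketch of how such an automaton might be built from company_list; the delimited-ticker format is a guess based on the character stripping above.

import ahocorasick

# Hypothetical construction of company_name_automaton from company_list,
# where each entry is assumed to be a delimited ticker such as "(AAPL)".
company_name_automaton = ahocorasick.Automaton()
for idx, entry in enumerate(company_list):
    # Key is the upper-cased text to search for; value is the list index,
    # matching how retrieve_forbes_article_data() looks entries up.
    company_name_automaton.add_word(entry.upper(), idx)
company_name_automaton.make_automaton()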
def reset(self):
    super().reset()
    self.state = 'start'
    self.paragraph = Paragraph()
def __init__(self, paragraphs_actions: List[ParagraphsAction] = []):
    super().__init__()
    self.paragraphs_actions = paragraphs_actions
    self.paragraph = Paragraph()
class BlogParser(MachineHTMLParser):
    paragraphs_actions: List[ParagraphsAction]
    paragraph: Paragraph

    def __init__(self, paragraphs_actions: List[ParagraphsAction] = []):
        super().__init__()
        self.paragraphs_actions = paragraphs_actions
        self.paragraph = Paragraph()

    # utilities
    def parse_file(self, filename: str, rel: Optional[str] = None):
        self.reset()
        if isinstance(rel, str):
            # Record the filename relative to `rel` rather than the path of `rel` itself.
            self.paragraph.filename = Path(filename).relative_to(rel)
        else:
            self.paragraph.filename = filename
        super().parse_file(filename)

    def parse_date(self, time: str) -> str:
        time_ = time.strip()
        fmt = '%A, %b %d, %Y'  # example: Monday, Jan 1, 2015
        try:
            return datetime.strptime(time_, fmt).isoformat()
        except ValueError:
            log.error(f'Invalid date format: "{time_}"')
            log.error(f'Invalid date format @ {self.location()}')
            return ""

    # reduce middleware once paragraph is read
    def push_paragraph(self):
        reduce(lambda x, f: f(x), self.paragraphs_actions, [self.paragraph])
        self.paragraph = self.paragraph.new_paragraph()

    # state-machine logic
    def reset(self):
        super().reset()
        self.state = 'start'
        self.paragraph = Paragraph()

    def validate_transition(self, to_state: State):
        return (self.state, to_state) in valid_transitions

    def dispatch(self, ms: TransitionData, attrs: Attrs = {}):
        if ms == ('start', 'header', 'starttag'):
            self.transition('metadata')
        elif ms == ('metadata', 'h1', 'starttag'):
            self.transition('title')
        elif ms == ('metadata', 'Author', 'DATA'):
            self.transition('author_1')
        elif ms == ('metadata', 'Date', 'DATA'):
            self.transition('date_1')
        elif ms == ('metadata', 'header', 'endtag'):
            self.transition('article')
        elif ms == ('title', '*', 'DATA'):
            self.paragraph.article_title = ms.tagOrData
        elif ms == ('title', 'h1', 'endtag'):
            self.transition('metadata')
        elif ms == ('author_1', 'p', 'starttag'):
            self.transition('author_2')
        elif ms == ('author_2', '*', 'DATA'):
            self.paragraph.author = ms.tagOrData
            self.transition('metadata')
        elif ms == ('date_1', 'p', 'starttag'):
            self.transition('date_2')
        elif ms == ('date_2', '*', 'DATA'):
            self.paragraph.date = self.parse_date(ms.tagOrData)
            self.transition('metadata')
        elif ms == ('article', 'h[23]', 'starttag'):
            self.push_paragraph()
            self.transition('subtitle')
        elif ms == ('subtitle', 'h[23]', 'endtag'):
            self.transition('article')
        elif ms == ('article', 'article', 'endtag'):
            self.push_paragraph()
            self.transition('done')
        elif ms == ('article', '*', 'DATA'):
            self.paragraph.text += ms.tagOrData
        # we keep <code> and <p> tags for use in chunking text later
        elif ms == ('article', 'p', 'starttag'):
            self.paragraph.text += "<p>"
        elif ms == ('article', 'p', 'endtag'):
            self.paragraph.text += "</p>"
        elif ms == ('article', 'code', 'starttag'):
            self.paragraph.text += '<code>'
        elif ms == ('article', 'code', 'endtag'):
            self.paragraph.text += '</code>'
        elif ms == ('subtitle', '*', 'DATA'):
            self.paragraph.paragraph_title += ms.tagOrData
def endElement(self, name):
    if name == 'empty-line':
        self.add_empty_line()
    elif self.is_body and name != 'body':
        data = ''.join(self.cur_data)
        if name in ['strong', 'emphasis', 'a', 'style']:
            if not self.cur_attr:
                ##print 'FB2 PARSER ERROR: nested styles?'
                return
            self.cur_attr.insert(1, len(data))
            self.attrs.append(self.cur_attr)
            self.cur_attr = []
            return
        if data and data.strip():
            if self.is_title:
                self.add_empty_line()
                self.content.append(
                    Paragraph('title', data,
                              attrs=self.attrs,
                              lang=self.lang,
                              id=self.cur_id,
                              byte_index=_parser.CurrentByteIndex))
            elif self.is_epigraph and name == 'p':
                self.content.append(
                    Paragraph('epigraph', data,
                              attrs=self.attrs,
                              lang=self.lang,
                              id=self.cur_id,
                              byte_index=_parser.CurrentByteIndex))
            elif self.is_cite and name == 'p':
                self.content.append(
                    Paragraph('cite', data,
                              attrs=self.attrs,
                              lang=self.lang,
                              id=self.cur_id,
                              byte_index=_parser.CurrentByteIndex))
            else:
                self.content.append(
                    Paragraph(name, data,
                              attrs=self.attrs,
                              lang=self.lang,
                              id=self.cur_id,
                              byte_index=_parser.CurrentByteIndex))
            self.prev_paragraph_is_empty = False
            self.attrs = []
            self.id = None
    if name == 'description':
        self.is_desc = False
    elif name == 'body':
        self.is_body = False
    elif name == 'epigraph':
        self.is_epigraph = False
        self.add_empty_line()
    elif name == 'cite':
        self.is_cite = False
    elif name == 'title':
        self.is_title = False
        self.add_empty_line()
    elif name in ('subtitle', 'image', 'poem'):
        self.add_empty_line()
    elif name == 'lang':
        self.lang = ''.join(self.cur_data).strip()
    #del self.elem_stack[-1]
    self.cur_data = []
    self.links = {}
def add_empty_line(self):
    if not self.prev_paragraph_is_empty:
        self.content.append(
            Paragraph('empty-line', '', byte_index=_parser.CurrentByteIndex))
        self.prev_paragraph_is_empty = True
def writeCoursesWithDescriptions(ws):
    global document
    global courses
    global coursesWithDescriptions
    global number
    global default_format
    global description_format
    courseDescriptionOn = 'false'
    allparagraphs = []
    pertinentParagraphs = []
    writeFile = "pertinentParagraphs.csv"
    pHeader = ""
    headerIsCourseTitle = 'false'
    for p in document.paragraphs:
        if p.text.strip() != "":
            number = number + 1
            paragraph = Paragraph()
            paragraph.setNumber(number)
            paragraph.setStyle(p.style.name)
            paragraph.setText(p.text.strip().rstrip(string.digits))
            allparagraphs.append(paragraph)
    for i, item in enumerate(allparagraphs):
        if item.getStyle() == constant.STYLE_BODY_TEXT:
            pPrev = allparagraphs[i - 1]
            pHeader = ""
            if pPrev.getStyle() == constant.STYLE_NORMAL:
                number = number + 1
                p2 = Paragraph()
                p2.setNumber(number)
                p2.setStyle(pPrev.getStyle())
                p2.setText(pPrev.getText())
                paragraphs2.append(p2)
                pHeader = pPrev.getText().strip()
            if pHeader != "":
                headerIsCourseTitle = HeaderIsCourseTitle(pHeader)
            if headerIsCourseTitle == 'true':
                number = number + 1
                pc = Paragraph()
                pc.setNumber(number)
                pc.setStyle(item.getStyle())
                pc.setHeader(pHeader)
                pc.setText(item.getText())
                pertinentParagraphs.append(pc)
    associatedCourse = Course()
    currentCourseDescription = ""
    courseTitlePosition = -1
    row = 0
    col = 0
    ws.write(row, col, 'Knowledge Area')
    col += 1
    ws.write(row, col, 'Course Title')
    col += 1
    ws.write(row, col, 'Description')
    col = 0
    row += 1
    for i, p in enumerate(pertinentParagraphs):
        courseTitleCandidate = p.getHeader().strip()
        if p.getHeader().strip() != "" and courseDescriptionOn == 'true' and i > courseTitlePosition:
            newCourse = Course()
            newCourse.setKnowledgeArea(associatedCourse.getKnowledgeArea())
            newCourse.setTitle(associatedCourse.getTitle())
            newCourse.setDescription(currentCourseDescription)
            coursesWithDescriptions.append(newCourse)
            courseDescriptionOn = 'false'
            ws.write(row, col, associatedCourse.getKnowledgeArea())
            col += 1
            ws.write(row, col, associatedCourse.getTitle())
            col += 1
            ws.write(row, col, currentCourseDescription)
            col = 0
            row += 1
            currentCourseDescription = ""
            courseTitlePosition = -1
        if p.getHeader().strip() != "" and courseDescriptionOn == 'false':
            courseDescriptionOn = 'true'
            associatedCourse = GetAssociatedCourse(p.getHeader().strip())
            courseTitlePosition = i
            currentCourseDescription += p.getText().strip()
        if courseDescriptionOn == 'true' and i > courseTitlePosition:
            currentCourseDescription += p.getText().strip()
    ws.set_column(0, 0, 35, default_format)
    ws.set_column(1, 1, 50, default_format)
    ws.set_column(2, 2, 120, description_format)
def writeDocStyles():
    global number
    global document
    writeFile = "docStyles_fall.csv"
    wrawFile = "docStylesRaw_fall.csv"
    with open(writeFile, 'w') as output:
        fieldnames = ['number', 'style', 'text']
        writer = csv.DictWriter(output, fieldnames=fieldnames)
        writer.writeheader()
        for p in document.paragraphs:
            if p.text.strip() != "":
                number = number + 1
                paragraph = Paragraph()
                runCnt = 0
                for r in p.runs:
                    runCnt += 1
                    run = Run()
                    run.setBold(str(r.bold))
                    run.setFont(r.font.name)
                    run.setItalic(str(r.font.italic))
                    paragraph.setRuns(run)
                paragraph.setNumber(number)
                paragraph.setStyle(p.style.name)
                paragraph.setText(p.text.strip().rstrip(string.digits))
                paragraph.setRunCount(runCnt)
                paragraphs1.append(paragraph)
        number = 0
        i = 0
        for i, item in enumerate(paragraphs1):
            if item.getStyle() == constant.STYLE_BODY_TEXT:
                pPrev = paragraphs1[i - 1]
                if pPrev.getStyle() == constant.STYLE_NORMAL:
                    number = number + 1
                    p2 = Paragraph()
                    p2.setNumber(number)
                    p2.setStyle(pPrev.getStyle())
                    p2.setText(pPrev.getText())
                    paragraphs2.append(p2)
                number = number + 1
                pc = Paragraph()
                pc.setNumber(number)
                pc.setStyle(item.getStyle())
                pc.setText(item.getText().strip())
                paragraphs2.append(pc)
        for paragraph in paragraphs2:
            writer.writerow({
                'number': paragraph.getNumber(),
                'style': paragraph.getStyle(),
                'text': paragraph.getText()
            })
    with open(wrawFile, "w") as rawoutput:
        fieldnames = ['number', 'style', 'runs', 'font', 'bold', 'text']
        rwriter = csv.DictWriter(rawoutput, fieldnames=fieldnames)
        rwriter.writeheader()
        for paragraph in paragraphs1:
            for run in paragraph.getRuns():
                rwriter.writerow({
                    'number': paragraph.getNumber(),
                    'style': paragraph.getStyle(),
                    'runs': paragraph.getRunCount(),
                    'font': run.getFont(),
                    'bold': run.getBold(),
                    'text': paragraph.getText()
                })
def parse_paragraphs(contents: Iterable[Line]) -> Iterator[Paragraph]:
    pp = Paragraph()
    for line in contents:
        pp.append_ahead(line)

        next_pp = pp.split_at_nomenclature()
        if next_pp:
            if not pp.is_empty():
                yield pp
            (retval, pp) = next_pp.next_paragraph()
            yield retval
            continue

        # New document triggers a new paragraph.
        if pp.last_line and pp.last_line.filename != line.filename:
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Page break triggers a new paragraph.
        # (The form-feed character is assumed as the page-break marker here;
        # the original literal appears to have been lost.)
        if line.startswith('\f'):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Page header is a whole paragraph.
        if pp.is_page_header():
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Leading tab triggers a new paragraph.
        if line.startswith('\t'):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Tables start with a few long lines and
        # continue to grow as long as we have short lines.
        if pp.is_table():
            if line.is_short(pp.short_line):
                continue
            else:
                if pp.is_all_long():
                    continue
                (retval, pp) = pp.next_paragraph()
                if not retval.is_empty():
                    yield retval
                continue

        # Blocks of blank lines are a paragraph.
        if pp.is_blank():
            if line.is_blank():
                continue
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Figures end with a blank line, or a period or colon at the end
        # of a line.
        if pp.is_figure():
            if (not line.is_blank() and not pp.detect_period()
                    and not pp.endswith(':')):
                continue
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Leading hyphen triggers a new paragraph.
        if line.startswith('-'):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # A table starts a new paragraph.
        if pp.next_line.is_table():
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # Synonymy reference ends a taxon.
        if pp.last_line and pp.last_line.search(r'\([Ss]yn.*\)$'):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # A taxon ends in nov., nov. comb., nov. sp., ined.,
        # emend. (followed by emender), or nom. sanct.
        if pp.last_line and pp.last_line.search(
                r'(nov\.|nov\.\s?(comb\.|sp\.)|[(]?in\.?\s?ed\.[)]?|'
                r'[(]?nom\.\s?sanct\.[)]?|emend\..*)$'):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # A short line ends a paragraph.
        if pp.last_line and pp.last_line.is_short(pp.short_line):
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue

        # A blank line ends a paragraph.
        if line.is_blank():
            (retval, pp) = pp.next_paragraph()
            if not retval.is_empty():
                yield retval
            continue
def main():
    args = define_args()
    Paragraph.set_reinterpretations(args.reinterpret)

    if args.dump_files:
        print('\ntraining_files:', args.training_files)
        print('\nevaluate_files:', args.evaluate_files)

    classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(max_iter=5, tol=None),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(max_iter=5, tol=-np.infty),
        OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        KNeighborsClassifier()
    ]
    vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        HashingVectorizer()
    ]
    fast_classifiers = [
        BernoulliNB(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        # BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        DummyClassifier(),
        PassiveAggressiveClassifier(max_iter=5, tol=None),
        RidgeClassifier(),
        # RidgeClassifierCV(),
        SGDClassifier(max_iter=5, tol=-np.infty),
        # OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        # KNeighborsClassifier()  # Actually not slow, but we run out of memory.
    ]
    fast_vectorizers = [
        CountVectorizer(),
        TfidfVectorizer(),
        # HashingVectorizer()
    ]
    if args.fast:
        classifiers = fast_classifiers
        vectorizers = fast_vectorizers

    try:
        i = [c.__class__.__name__ for c in classifiers].index(args.classifier)
    except ValueError:
        raise ValueError('Unknown classifier %s' % args.classifier)
    classifier = classifiers[i]
    try:
        i = [v.__class__.__name__ for v in vectorizers].index(args.vectorizer)
    except ValueError:
        raise ValueError('Unknown vectorizer %s' % args.vectorizer)
    vectorizer = vectorizers[i]

    if args.training_files:
        contents = read_files(args.training_files)
        if args.annotated_paragraphs:
            phase1 = parse_annotated(contents)
        else:
            phase1 = parse_paragraphs(contents)
        if 1 in args.dump_phase:
            print('Phase 1')
            print('=======')
            phase1 = list(phase1)
            print(repr(phase1))
            if 1 == max(args.dump_phase):
                sys.exit(0)

        if args.keep_interstitials:
            phase2 = phase1
        else:
            phase2 = remove_interstitials(phase1)
        phase1 = None  # Potentially recover memory.
        if 2 in args.dump_phase:
            print('Phase 2')
            print('=======')
            phase2 = list(phase2)
            print(repr(phase2))
            if 2 == max(args.dump_phase):
                sys.exit(0)

        # All labels need to be resolved for this phase. The easiest way
        # to assure this is to convert to list.
        phase3 = target_classes(
            list(phase2),
            default=Label('Misc-exposition'),
            keep=[Label(l) for l in args.labels]
        )
        if args.dump_input:
            phase3 = list(phase3)
            if args.output_annotated:
                if not args.output_labels:
                    print('\n'.join([pp.as_annotated() for pp in phase3]))
                else:
                    print('\n'.join([pp.as_annotated() for pp in phase3
                                     if pp.top_label() in args.output_labels]))
            else:
                print('\n'.join([str(pp) for pp in phase3]))
        phase2 = None
        if 3 in args.dump_phase:
            print('Phase 3')
            print('=======')
            phase3 = list(phase3)
            print(repr(phase3))
            if 3 == max(args.dump_phase):
                sys.exit(0)

        phase3 = list(phase3)
        sample_size = len(phase3)
        if args.group_paragraphs:
            writer = csv.DictWriter(sys.stdout, fieldnames=Taxon.FIELDNAMES)
            writer.writeheader()
            for taxon in group_paragraphs(phase3):
                for d in taxon.dictionaries():
                    writer.writerow(d)
            sys.exit(0)

        np.random.seed(SEED)
        cutoff = int(sample_size * 0.70)
        permutation = np.random.permutation(phase3)
        phase3 = None
        learn = paragraph.to_dataframe(permutation[:cutoff], args.suppress_text)
        test = paragraph.to_dataframe(permutation[cutoff:], args.suppress_text)

        if args.test_classifiers:
            perform(classifiers, vectorizers, learn, test)
            sys.exit(0)
        if args.test_classifiers_by_label:
            perform_confusion_matrix(classifiers, vectorizers, learn, test,
                                     emit_csv=args.csv)
            sys.exit(0)

    # train or load models
    if args.load_vectorizer:
        vectorizer = joblib.load(args.load_vectorizer)
        classifier = joblib.load(args.load_classifier)
    else:
        vectorize_text = vectorizer.fit_transform(learn.v2)
        classifier.fit(vectorize_text, learn.v1)

    # Dump trained models.
    if args.dump_vectorizer:
        joblib.dump(vectorizer, args.dump_vectorizer)
    if args.dump_classifier:
        joblib.dump(classifier, args.dump_classifier)

    if args.evaluate_files:
        phase4 = []
        # predict
        if args.keep_interstitials:
            evaluated = parse_paragraphs(read_files(args.evaluate_files))
        else:
            evaluated = remove_interstitials(
                parse_paragraphs(read_files(args.evaluate_files)))
        for pp in evaluated:
            text = str(pp)
            vectorize_text = vectorizer.transform([text])
            predict = classifier.predict(vectorize_text)[0]
            if args.insert_nomenclature and pp.contains_nomenclature():
                predict = 'Nomenclature'
            phase4.append(pp.replace_labels(labels=[Label(predict)]))
        if args.output_annotated:
            if not args.output_labels:
                print('\n'.join([pp.as_annotated() for pp in phase4]))
            else:
                print('\n'.join([pp.as_annotated() for pp in phase4
                                 if pp.top_label() in args.output_labels]))
        if 4 in args.dump_phase:
            print('Phase 4')
            print('=======')
            print(repr(phase4))
            if 4 == max(args.dump_phase):
                sys.exit(0)
def ExtractCourseDescriptions(document, courses):
    courseDescriptionOn = 'false'
    allparagraphs = []
    pertinentParagraphs = []
    number = 0
    pHeader = ""
    headerIsCourseTitle = 'false'
    partialTitle = []
    fulltitle = ""
    for p in document.paragraphs:
        if p.text.strip() != "":
            pText = p.text.strip()
            lastPos = len(pText)
            if pText[lastPos - 1] == ":":
                partialTitle.append(pText)
            else:
                if len(partialTitle) > 0:
                    partialTitle.append(p.text.strip())
                    fulltitle = ' '.join(partialTitle)
                else:
                    fulltitle = pText
            number = number + 1
            paragraph = Paragraph()
            paragraph.setNumber(number)
            paragraph.setStyle(p.style.name)
            paragraph.setText(p.text.strip().rstrip(string.digits))
            allparagraphs.append(paragraph)
    for i, item in enumerate(allparagraphs):
        if item.getStyle() == constant.STYLE_BODY_TEXT:
            pPrev = allparagraphs[i - 1]
            pHeader = ""
            if pPrev.getStyle() == constant.STYLE_NORMAL:
                number = number + 1
                p2 = Paragraph()
                p2.setNumber(number)
                p2.setStyle(pPrev.getStyle())
                p2.setText(pPrev.getText())
                paragraphs2.append(p2)
                pHeader = pPrev.getText().strip()
            if pHeader != "":
                headerIsCourseTitle = HeaderIsCourseTitle(pHeader)
            if headerIsCourseTitle == 'true':
                number = number + 1
                pc = Paragraph()
                pc.setNumber(number)
                pc.setStyle(item.getStyle())
                pc.setHeader(pHeader)
                pc.setText(item.getText())
                pertinentParagraphs.append(pc)
    associatedCourse = Course()
    currentCourseDescription = ""
    courseTitlePosition = -1
    for i, p in enumerate(pertinentParagraphs):
        courseTitleCandidate = p.getHeader().strip()
        #print("courseTitleCandidate: {}".format(courseTitleCandidate))
        if p.getHeader().strip() != "" and courseDescriptionOn == 'true' and i > courseTitlePosition:
            associatedCourse.setDescription(currentCourseDescription)
            courseDescriptionOn = 'false'
            currentCourseDescription = ""
            courseTitlePosition = -1
        if p.getHeader().strip() != "" and courseDescriptionOn == 'false':
            courseDescriptionOn = 'true'
            associatedCourse = GetAssociatedCourse(courses, p.getHeader().strip())
            #print("return from getAssociatedCourse(): {}".format(associatedCourse.getTitle()))
            courseTitlePosition = i
            currentCourseDescription += p.getText().strip()
        if courseDescriptionOn == 'true' and i > courseTitlePosition:
            currentCourseDescription += p.getText().strip()
img = PIL.Image.open(join(img_dir, img_list[page]))
print('image loaded')
img: Image
width = img.width
name_font = load_font(
    join(
        current_dir,
        try_get(settings, 'name_font', 'fonts/FZY3JW.TTF', quiet=True)),
    int(get_best_font_size(width) * name_scale_factor))
word_font = load_font(
    join(
        current_dir,
        try_get(settings, 'word_font', 'fonts/FZY3JW.TTF', quiet=True)),
    int(get_best_font_size(width)))
paragraph = Paragraph(width - border_width * 2)
for i in range(length):
    name = names[i]
    word = words[i]
    if name != '':
        name_color = try_get(name_color_dict, name, default_text_color, quiet=True)
        name_block = TextBlock(name + ': ', name_font, name_color)
        paragraph.add_text_block(name_block)
    else:
        name_color = default_text_color
    word_block = TextBlock(word, word_font, name_color)
def ExtractCourseDescriptions(document: Document, courses: List[Course]) -> None:
    """
    Extract the course descriptions from the document.
    Write them to the appropriate Course records in the database.

    :param document: a Python-Docx Document object
    :param courses: a List of Course objects
    """
    global courseNameStyles
    courseDescriptionOn = False
    allparagraphs = []
    pertinentParagraphs = []
    number = 0
    pHeader = ""
    hIsCourseTitle = False
    partialTitle = []
    for p in document.paragraphs:
        if p.text.strip() != "":
            pText = p.text.strip()
            lastPos = len(pText)
            if pText[lastPos - 1] == ":":
                partialTitle.append(pText)
            else:
                if len(partialTitle) > 0:
                    partialTitle.append(p.text.strip())
            number = number + 1
            paragraph = Paragraph()
            paragraph.setNumber(number)
            paragraph.setStyle(p.style.name)
            paragraph.setText(p.text.strip().rstrip(string.digits))
            allparagraphs.append(paragraph)
    for i, item in enumerate(allparagraphs):
        if item.getStyle() == constant.STYLE_BODY_TEXT:
            pPrev = allparagraphs[i - 1]
            pHeader = ""
            if pPrev.getStyle() in courseNameStyles:
                number = number + 1
                p2 = Paragraph()
                p2.setNumber(number)
                p2.setStyle(pPrev.getStyle())
                p2.setText(pPrev.getText())
                paragraphs2.append(p2)
                pHeader = pPrev.getText().strip()
            if pHeader != "":
                hIsCourseTitle = HeaderIsCourseTitle(pHeader)
            if hIsCourseTitle:
                number = number + 1
                pc = Paragraph()
                pc.setNumber(number)
                pc.setStyle(item.getStyle())
                pc.setHeader(pHeader)
                pc.setText(item.getText())
                pertinentParagraphs.append(pc)
    associatedCourse = Course()
    currentCourseDescription = ""
    courseTitlePosition = -1
    for i, p in enumerate(pertinentParagraphs):
        #if p.getHeader().strip() != "":
        #    print("paragraph header: {}".format(p.getHeader().strip()))
        if p.getHeader().strip() != "" and courseDescriptionOn and i > courseTitlePosition:
            associatedCourse.setDescription(currentCourseDescription)
            courseDescriptionOn = False
            currentCourseDescription = ""
            courseTitlePosition = -1
        if p.getHeader().strip() != "" and not courseDescriptionOn:
            courseDescriptionOn = True
            associatedCourse = GetAssociatedCourse(courses, p.getHeader().replace("'", "").strip())
            courseTitlePosition = i
            currentCourseDescription += p.getText().strip()
        if courseDescriptionOn and i > courseTitlePosition:
            currentCourseDescription += p.getText().strip()
if __name__ == '__main__':
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    documents = []
    vocabulary = set()
    files = os.listdir('corpus')
    for i, file in enumerate(files):
        with open('corpus/' + file, encoding="utf8", errors='ignore') as f:
            raw = f.read()
        paras = paragraph_tokenizer(raw)
        paragraphs = []
        for j, para in enumerate(paras):
            # Preprocessing
            tokens = preprocessor(para)
            _id = (i, j)
            paragraph = Paragraph(_id, tokens)
            paragraphs.append(paragraph)
            for term in tokens:
                vocabulary.add(term)
        document = Document(i, paragraphs)
        documents.append(document)

    # Length of vocabulary
    vocabularyLength = len(vocabulary)
    # print(vocabularyLength)

    # Creating the inverted index
    indexer = Indexer(documents)

    # Take filename as input for processing
    inputDocument = str(sys.argv[1])