def result():
    """Render ``result3.html`` for a text submitted through a POST form.

    Reads the ``Text`` form field, feeds it to a fresh ``Article`` and passes
    the article's text and category to the template as ``result``.

    NOTE(review): for any method other than POST this returns ``None``;
    presumably the route is registered as POST-only -- confirm, since a web
    framework will normally reject a ``None`` response.
    """
    if request.method != 'POST':
        return None

    submitted_text = request.form['Text']

    article = Article()
    article.addText(submitted_text)

    # Only the raw text and the category are exposed to the template.
    template_data = {
        'text': article.get_text(),
        'category': article.get_category(),
    }
    return render_template("result3.html", result=template_data)
def execute_with_params(self, file_pattern="", path="", magazines=None,articles=None):
    """Walk *path* and load document files into magazines / articles.

    For every file found by ``os.walk(path)``:

    * A file whose name equals ``file_pattern`` is skipped (presumably a
      leftover output file written by an older version -- confirm).
    * A file whose name ends with ``file_pattern`` is read with
      ``self.read``.  When ``magazines`` is given and the document is not
      empty, the text is attached to the magazine returned by
      ``magazines[self.getMagazine(filename)]``: to the article covering the
      page number extracted from the file name if one exists, otherwise to a
      newly created ``Article`` that is added to the magazine.  Text longer
      than 3000 characters is added in space-separated chunks of at most
      3000 characters.
    * Any other ``.xml`` / ``.html`` file is read with ``self.readDocument``,
      converted to text with ``htmlParser`` and split with
      ``self.split_document``; the resulting article is appended to the
      result list.

    Parameters:
        file_pattern: suffix selecting the magazine page files.
        path: root directory to walk.
        magazines: mapping of magazine id -> magazine object, or ``None`` to
            skip magazine processing.
        articles: unused; kept for backward compatibility.

    Returns:
        ``(result, listOfMagazines)`` where ``listOfMagazines`` holds the ids
        of all magazines touched (in first-seen order) and ``result`` holds
        the articles built from ``.xml`` / ``.html`` files followed by the
        touched magazine objects.

    Raises:
        KeyError: if ``self.getMagazine`` returns an id missing from
            ``magazines`` (when ``magazines`` is a dict).

    Changes compared with the previous implementation:

    * No file is opened for writing any more.  The old code truncated or
      created ``<folder>/<file_pattern>`` in every directory without ever
      writing to it, and raised for the default ``file_pattern=""``.
    * The undefined name ``values`` is no longer referenced.
    * Document paths are built with ``os.path.join`` so *path* does not need
      a trailing separator.
    * NOTE(review): the ``.xml`` / ``.html`` branches previously always
      failed on unbound names (``result``, ``html``, ``page``).  They now
      collect into ``result``, use the per-folder page counter as the page
      number, and the unused ``xmlParser(input_file=doc)`` construction was
      dropped -- confirm this matches the intended behaviour.
    """
    max_chunk = 3000  # longest text passed to a single addText() call

    listOfMagazines = []
    result = []
    document = ""

    for folder, _subs, files in os.walk(path):
        page_count = 0  # counts .xml/.html files per folder
        for filename in files:
            if filename == file_pattern:
                continue

            if filename.endswith(file_pattern):
                document = os.path.join(folder, filename)
                doc = self.read(document)
                if magazines is None or len(doc) < 1:
                    continue

                fileId = self.getMagazine(filename)
                mag = magazines[fileId]
                if fileId not in listOfMagazines:
                    listOfMagazines.append(fileId)

                page = self.extractPageNumberFromFilename(filename)
                if page > 0:
                    article = mag.find_article_by_page(page)
                    logger.debug(article)
                    logger.debug(filename)

                    # The magazine name is whatever precedes "page" in the
                    # file name (the whole name when "page" is absent).
                    mag.set_name(filename.split("page")[0])

                    if article is not None:
                        # Append this page to the article that covers it.
                        if len(doc) < 5:
                            logger.info("SMALL %s = %s", len(doc), doc)
                        self.setLengths(len(doc))
                        if len(doc) > max_chunk:
                            # Too long to process in one go: add it in
                            # word-aligned chunks.
                            chunk = ""
                            for word in doc.split(' '):
                                # +1 accounts for the separating space.
                                too_long = len(chunk) + len(word) + 1 > max_chunk
                                # Never flush an empty chunk (first word
                                # longer than the limit).
                                if chunk and too_long:
                                    article.addText(chunk, page)
                                    chunk = ""
                                chunk += word + " "
                            if chunk:
                                article.addText(chunk, page)
                        else:
                            article.addText(doc, page)
                        # NOTE(review): assumed to apply to chunked and
                        # unchunked documents alike -- confirm.
                        article.set_len(len(doc))
                    else:
                        # No article covers this page: store it on its own.
                        article = Article(filename, page, "")
                        article.addText(doc, page)
                        article.set_len(len(doc))
                        self.setLengths(len(doc))
                        mag.add_article(article)
                else:
                    # No usable page number: store the file page by page.
                    print("stored file "+filename+" as article was not found")
                    article = Article(filename, page, "")
                    article.addText(doc, page)
                    article.set_len(len(doc))
                    mag.add_article(article)
                    self.setLengths(len(doc))

                # Written back in case ``magazines`` is not a plain dict.
                magazines[fileId] = mag

            elif filename.endswith((".xml", ".html")):
                page_count += 1
                doc, document = self.readDocument(path, folder, filename, document)
                if len(doc) == 0:
                    continue
                # XML files are only processed when they contain markup.
                if (filename.endswith(".xml")
                        and BeautifulSoup(doc, "html.parser").find() is None):
                    continue

                self.setLengths(len(doc))
                article = Article(filename, page_count, "")
                article.set_len(len(doc))
                html = htmlParser(doc)
                result.append(
                    self.split_document(html.get_text(), article, page_count))

    logger.debug("VALUES FOR magazines %s", len(listOfMagazines))
    for mag_id in listOfMagazines:
        if magazines[mag_id] not in result:
            result.append(magazines[mag_id])
    return result, listOfMagazines