Esempio n. 1
0
def result():
    """Render ``result3.html`` for text submitted through a POST form.

    Reads the ``Text`` form field, loads it into a fresh ``Article`` and
    passes the article's text and category to the template under the
    name ``result``.

    Returns:
        The rendered template for POST requests. For any other request
        method nothing is rendered and ``None`` is returned.
    """
    if request.method != 'POST':
        return None

    # The article is populated from the submitted text only; no download,
    # summary or category-setting step is run here.
    submitted_text = request.form['Text']
    article = Article()
    article.addText(submitted_text)

    payload = {}
    payload['text'] = article.get_text()
    payload['category'] = article.get_category()
    return render_template("result3.html", result=payload)
Esempio n. 2
0
 def execute_with_params(self, file_pattern="", path="", magazines=None, articles=None):
     """Walk ``path`` and load the files found there into magazines/articles.

     Every file of every folder yielded by ``os.walk(path)`` is handled by
     the first rule that matches its name:

     * equal to ``file_pattern``: skipped;
     * ending with ``file_pattern``: read with ``self.read``. When
       ``magazines`` is given and the text is non-empty, the text is added
       to the article the file's magazine reports for the file's page
       number. Texts longer than 3000 characters are added in
       space-delimited chunks. When the magazine has no such article, or
       the page number is not positive, a new ``Article`` holding the text
       is added to the magazine instead;
     * ending with ``.xml`` or ``.html``: read with ``self.readDocument``
       and converted with ``self.split_document``; the resulting article
       is appended to the returned list.

     Args:
         file_pattern: File-name suffix selecting the page files. A file
             with exactly this name is opened for writing in each folder.
         path: Root directory to walk.
         magazines: Mapping from the id returned by ``self.getMagazine``
             to a magazine object. When ``None``, pattern-matching files
             are read but otherwise ignored.
         articles: Unused.

     Returns:
         A tuple ``(result, listOfMagazines)``: ``result`` holds the
         articles built from ``.xml``/``.html`` files followed by each
         touched magazine once; ``listOfMagazines`` holds the ids of the
         touched magazines in first-seen order.
     """
     listOfMagazines = []
     # Bound before the walk: the .xml/.html branches append to it, which
     # raised UnboundLocalError while it was only created after the loop.
     # NOTE(review): those articles are now returned next to the magazines;
     # confirm that this is the intended contract.
     result = []
     # Defaults for names the .xml/.html branches read; previously they were
     # unbound unless a pattern-matching file had been processed first.
     page = 0
     document = ""
     for folder, _subs, files in os.walk(path):
         # Mode 'w' creates (or truncates) a file named ``file_pattern`` in
         # every visited folder; nothing is written to it. Kept as-is in case
         # callers rely on that file. The ``with`` statement closes it.
         # NOTE(review): with the default file_pattern="" the joined path is
         # the folder itself, which cannot be opened for writing.
         with open(os.path.join(folder, file_pattern), 'w'):
             page_count = 0
             for filename in files:
                 if filename == file_pattern:
                     continue
                 if filename.endswith(file_pattern):
                     # NOTE(review): substring test on the path strings, and
                     # ``path + filename`` adds no separator; verify intent.
                     if folder not in path:
                         document = folder+"/"+filename
                     else:
                         document = path+filename
                     doc = self.read(document)
                     if magazines is None or len(doc) < 1:
                         continue

                     fileId = self.getMagazine(filename)
                     mag = magazines[fileId]
                     if fileId not in listOfMagazines:
                         listOfMagazines.append(fileId)
                     page = self.extractPageNumberFromFilename(filename)
                     if page > 0:
                         article = mag.find_article_by_page(page)
                         logger.debug(article)
                         logger.debug(filename)

                         # The magazine name is whatever precedes "page" in
                         # the file name (the whole name if "page" is absent).
                         mag.set_name(filename.split("page")[0])
                         if article is not None:
                             # Formerly also tested ``len(values[-1:])<3900``
                             # on an undefined ``values`` (NameError). A
                             # one-element slice is never that long, so the
                             # term is dropped.
                             if len(doc) < 5:
                                 logger.info("SMALL "+str(len(doc)) + " = "+doc)
                             # doc is known to be non-empty at this point.
                             self.setLengths(len(doc))
                             if len(doc) > 3000:
                                 # Too long to process at once: flush the
                                 # buffer whenever the next word (plus its
                                 # space) would push it past 3000 characters.
                                 sentences = ""
                                 for word in doc.split(' '):
                                     if len(sentences) + len(word) + 1 > 3000:
                                         article.addText(sentences, page)
                                         sentences = ""
                                     sentences += word+" "
                                 if sentences:
                                     article.addText(sentences, page)
                             else:
                                 article.addText(doc, page)
                                 article.set_len(len(doc))
                         else:
                             # The magazine has no article for this page.
                             article = Article(filename, page, "")
                             article.addText(doc, page)
                             article.set_len(len(doc))
                             self.setLengths(len(doc))
                             mag.add_article(article)
                     else:
                         # No usable page number: store the file as its own
                         # article.
                         print("stored file "+filename+" as article was not found")
                         article = Article(filename, page, "")
                         article.addText(doc, page)
                         article.set_len(len(doc))
                         mag.add_article(article)
                         self.setLengths(len(doc))

                     magazines[fileId] = mag
                 elif filename.endswith(".xml"):
                     page_count += 1
                     article = Article(filename, page, "")
                     doc, document = self.readDocument(path, folder, filename, document)
                     if len(doc) > 0:
                         # The parser object itself is not used; the call is
                         # kept in case constructing it has side effects.
                         xmlParser(input_file=doc)
                         # Only continue when the text contains markup. This
                         # used to sniff ``html`` before it was assigned, so
                         # it now sniffs the text that was just read.
                         if BeautifulSoup(doc, "html.parser").find():
                             self.setLengths(len(doc))
                             article.set_len(len(doc))
                             html = htmlParser(doc)
                             article = self.split_document(html.get_text(), article, page_count)
                             result.append(article)
                 elif filename.endswith(".html"):
                     page_count += 1
                     article = Article(filename, page, "")
                     doc, document = self.readDocument(path, folder, filename, document)
                     if len(doc) > 0:
                         self.setLengths(len(doc))
                         article.set_len(len(doc))
                         html = htmlParser(doc)
                         article = self.split_document(html.get_text(), article, page_count)
                         result.append(article)

     logger.debug("VALUES FOR magazines "+str(len(listOfMagazines)))
     for mag_id in listOfMagazines:
         if magazines[mag_id] not in result:
             result.append(magazines[mag_id])
     return result, listOfMagazines