def test_extract(self):
    with open('webarticle2text/fixtures/SAMPLE1.html', 'rb') as fin:
        raw = fin.read()
    ret = webarticle2text.extractFromHTML(raw)
    print(ret)
    self.assertTrue(ret.startswith('Python 3.6.0 is now available!'))
    self.assertTrue(ret.endswith('is now available for download on python.org.'))
def article_summary(url):
    r = op.open(url)
    title = op.title()
    raw_html = r.read()
    # keep only printable characters before handing the page to the extractor
    clean_html = filter(lambda x: x in string.printable, raw_html)
    main_text = webarticle2text.extractFromHTML(clean_html)
    sentences = sent_detector.tokenize(main_text)
    # return "<body> <h1>%s</h1> <p><ol><li>%s</li></ol></p></body>" % (
    #     title,
    #     '</li><li>'.join(sentences))
    return "<body> <h1>%s</h1> <p>%s</p></body>" % (
        title, ' '.join(sentences))
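The snippet above relies on two module-level objects defined elsewhere: `op` (a browser-like opener with `.open()` and `.title()`) and `sent_detector` (a sentence tokenizer). A minimal sketch of how they might be set up, assuming mechanize and NLTK's punkt tokenizer; both choices are assumptions, not confirmed by the snippet:

import string
import mechanize   # assumed: provides the Browser object used as `op`
import nltk.data   # assumed: provides the punkt sentence tokenizer
import webarticle2text

# op.open(url) returns a response object with .read(); op.title() returns the page title
op = mechanize.Browser()
# PunktSentenceTokenizer exposes .tokenize(text) -> list of sentences
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')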
def index(crawl_dir, index_dir, get_first_N=None):
    df_files = []
    file_indexed_dict = {}
    index = None
    if os.path.isfile(index_dir + "/loogle_files.csv"):
        df = pd.read_csv(index_dir + "/loogle_files.csv")
        df_files = df[['file', 'size']].values.tolist()
        file_indexed_dict = df.set_index('file').to_dict()['size']
        index = open_dir(index_dir)
    else:
        pdf_schema = Schema(path=ID(stored=True),
                            title=TEXT(stored=True),
                            text=TEXT)
        index = create_in(index_dir, pdf_schema)
    writer = index.writer()

    # get all potential files to be indexed
    dirs, files = rsync.ls(crawl_dir)
    files = [(f.replace("\\", "/"), size) for (f, size) in files]
    files = [(f, size) for (f, size) in files if os.path.splitext(f)[1] in exts]
    files_crawled_dict = dict(files)

    tmp = {}  # needed because we cannot change file_indexed_dict while iterating
    for ff in file_indexed_dict:
        if ff not in files_crawled_dict:
            # remove file from index if it exists in the index but not on the file system
            print ff, 'removed'
            writer.delete_by_term('path', unicode(ff))
            tmp[ff] = file_indexed_dict[ff]
        elif files_crawled_dict[ff] != file_indexed_dict[ff]:
            # this is the only case we do not add to tmp; this is how an updated
            # file is dropped from the index dictionary so its absence will be
            # detected below and it will be freshly reindexed
            print ff, 'size different update'
            writer.delete_by_term('path', unicode(ff))
        else:
            tmp[ff] = file_indexed_dict[ff]  # put it back in
    file_indexed_dict = tmp

    if get_first_N:
        files = files[:get_first_N]

    for i, (file, size) in enumerate(files):
        if file in file_indexed_dict:
            print 'skipping', file
            continue
        print 'processing', file
        ext = os.path.splitext(file)[1]
        if ext == ".pdf":
            cmd = pdfcmd % (file, os.environ['TEMP'])
            os.system(cmd)
        elif ext == ".djvu":
            cmd = djvucmd % (file, os.environ['TEMP'])
            os.system(cmd)
            os.system("unix2dos -7 %s/loog.txt" % os.environ['TEMP'])
        elif ext == ".html":
            with codecs.open(file, encoding='utf-8') as f:
                content = f.read()
            content = webarticle2text.extractFromHTML(content)
            fout = open("%s/loog.txt" % os.environ['TEMP'], "w")
            fout.write(content.encode("utf8"))
            fout.close()
        elif ext == ".txt":
            shutil.copy(file, "%s/loog.txt" % os.environ['TEMP'])

        # turn the file name itself into content as well, just in case; if text
        # conversion does not output anything, at least we can use some
        # information from the file name for search
        filename_as_content = os.path.basename(file).replace("_", " ").replace("-", " ")
        filename_as_content = filename_as_content.decode("latin-1")
        for x in exts:
            filename_as_content = filename_as_content.replace(x, "")
        filename_as_content += " "

        with codecs.open("%s/loog.txt" % os.environ['TEMP'], encoding='utf-8') as f:
            content = f.read()
        writer.add_document(path=unicode(file),
                            title=unicode(filename_as_content),
                            text=unicode(content))
        df_files.append([file, size])

    writer.commit()
    df_files = pd.DataFrame(df_files, columns=['file', 'size'])
    df_files.to_csv(index_dir + "/loogle_files.csv", index=None)
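Once `index()` has built the Whoosh index, documents can be queried on the `text` field defined in the schema. A minimal search sketch, assuming only the `index_dir` produced above; the `search` helper name and the query string are illustrative, not part of the original code:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def search(index_dir, query_string, limit=10):
    ix = open_dir(index_dir)
    with ix.searcher() as searcher:
        # parse the query against the full-text field; only `path` and
        # `title` were stored in the schema, so only those are retrievable
        query = QueryParser("text", ix.schema).parse(query_string)
        for hit in searcher.search(query, limit=limit):
            print("%s - %s" % (hit['path'], hit['title']))

# example: search(index_dir, u"neural networks")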
def extract_html_content(self, content=None):
    """Extract the most interesting content from the html"""
    if not content:
        content = self.content
    content = webarticle2text.extractFromHTML(content)
    return content