import shelve

import crawl
import text_extraction
from text_extraction import extract_texts


def htmls_to_docs(urls, htmls, text_params):
    """
    Given a list of urls, the corresponding HTML pages, and a dictionary
    with text preprocessing parameters, return a list with the extracted
    text of each HTML.
    """
    docs = []
    if text_params['cache_docs']:
        db = shelve.open(text_params['docs_shelve'])
    for url, html in zip(urls, htmls):
        # Serve already-extracted documents from the shelve cache.
        if text_params['cache_docs'] and str(url) in db:
            docs.append(db[str(url)])
            continue
        doc = text_extraction.extract_texts(
            html,
            title_weight=text_params['title_weight'],
            header_weights=text_params['header_weights'],
            use_pdf=text_params['use_pdf'],
            use_stemmer=text_params['use_stemmer'],
            ukkonen_len=text_params['ukkonen_len'],
            homepage_weight=text_params['homepage_weight'])
        doc = ' '.join(doc)
        docs.append(doc)
        if text_params['cache_docs']:
            db[str(url)] = doc
    if text_params['cache_docs']:
        db.close()
    return docs
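# A minimal sketch of how htmls_to_docs can be driven end to end. All of
# the parameter values below (weights, shelve path, sample page) are
# illustrative assumptions, not values taken from this codebase.
def _example_htmls_to_docs():
    text_params = {
        'cache_docs': True,
        'docs_shelve': 'docs_cache.shelve',  # assumed cache file name
        'title_weight': 3,
        'header_weights': [3, 2, 1],
        'use_pdf': False,
        'use_stemmer': True,
        'ukkonen_len': 4,
        'homepage_weight': 2,
    }
    urls = ['http://example.com']
    htmls = ['<html><head><title>Example</title></head>'
             '<body><h1>Hello</h1><p>world</p></body></html>']
    return htmls_to_docs(urls, htmls, text_params)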
def __test_site_kw(url, kws):
    """
    Crawl `url` and check that every keyword in `kws` occurs in the
    extracted text, printing the result for each site.
    """
    if not kws:
        return
    html = crawl.extract_html_rec(url, max_depth=4)
    txt = text_extraction.extract_texts(html)
    txts = ' '.join(txt)
    failed = False
    for kw in kws:
        # Keywords are normalized the same way as the extracted text.
        if text_extraction.clean_text(kw).lower() not in txts:
            print('error, keyword %r not found in text of %s' % (kw, url))
            failed = True
    if not failed:
        print('OK %s' % url)
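# Hedged usage sketch for __test_site_kw: the site-to-keywords mapping
# below is made up for illustration, and running it requires network
# access, since crawl.extract_html_rec fetches the pages recursively.
def _run_site_kw_tests():
    sites = {
        'http://example.com': ['example', 'domain'],
        'http://example.org': ['example'],
    }
    for url, kws in sites.items():
        __test_site_kw(url, kws)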
def htmls2docs(self, htmls):
    """
    Convert HTMLs to text documents using the params specified in
    `text_params`.
    """
    docs = []
    for html in htmls:
        doc = extract_texts(
            html,
            title_weight=self.text_params['title_weight'],
            header_weights=self.text_params['header_weights'],
            use_pdf=self.text_params['use_pdf'],
            use_stemmer=self.text_params['use_stemmer'],
            ukkonen_len=self.text_params['ukkonen_len'],
            homepage_weight=self.text_params['homepage_weight'])
        docs.append(' '.join(doc))
    return docs
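# Hedged sketch of a minimal owner class for htmls2docs, so the method
# can be exercised standalone; the class name and its wiring are
# assumptions, not part of this codebase.
class _ExampleConverter(object):
    def __init__(self, text_params):
        self.text_params = text_params

    htmls2docs = htmls2docs  # reuse the method defined above

# Example call, with text_params built as in _example_htmls_to_docs:
#     docs = _ExampleConverter(text_params).htmls2docs(htmls)
#     # docs[i] is the space-joined, weighted text of htmls[i]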