Example #1
0
def htmls_to_docs(urls, htmls, text_params):
    """ Return one extracted text document per (url, html) pair.

    `urls` and `htmls` are walked in lockstep with `zip`, so surplus items
    of the longer sequence are ignored.

    `text_params` is a dictionary; the keys read here are:

    * 'cache_docs'  -- truthy to cache documents in a shelve file.
    * 'docs_shelve' -- shelve filename; only read when caching is enabled.
    * 'title_weight', 'header_weights', 'use_pdf', 'use_stemmer',
      'ukkonen_len', 'homepage_weight' -- forwarded to
      `text_extraction.extract_texts`; only read when a document actually
      has to be extracted.

    With caching enabled, a document already stored under `str(url)` is
    returned as-is; otherwise it is extracted and stored under that key.
    With caching disabled every html is extracted and the shelf is never
    opened.

    """
    use_cache = text_params['cache_docs']
    db = shelve.open(text_params['docs_shelve']) if use_cache else None
    docs = []
    try:
        for url, html in zip(urls, htmls):
            key = str(url)
            if use_cache and key in db:
                docs.append(db[key])
                continue
            # NOTE(review): the keyword below is spelled `use_stmmer`; it is
            # left untouched because the signature of `extract_texts` is not
            # visible from here -- confirm it matches the callee.
            doc = text_extraction.extract_texts(
                html,
                title_weight=text_params['title_weight'],
                header_weights=text_params['header_weights'],
                use_pdf=text_params['use_pdf'],
                use_stmmer=text_params['use_stemmer'],
                ukkonen_len=text_params['ukkonen_len'],
                homepage_weight=text_params['homepage_weight'])
            doc = ' '.join(doc)
            docs.append(doc)
            if use_cache:
                db[key] = doc
    finally:
        # Close the shelf even when extraction raises, so the cache file
        # handle is not leaked.
        if db is not None:
            db.close()
    return docs
Example #2
0
def __test_site_kw(url, kws):
    if len(kws) > 0:
        html = crawl.extract_html_rec(url, max_depth=4)
        txt = text_extraction.extract_texts(html)
        txts = ' '.join(txt)

        failed = False
        for kw in kws:
            if txts.find(text_extraction.clean_text(kw).lower()) == -1:
                print 'error, keyword not found in txt', url, kw
                failed = True
        if not failed:
            print 'OK', url
Example #3
0
 def htmls2docs(self, htmls):
     """ Turn each HTML in `htmls` into a single text document.

         Every HTML is passed to `extract_texts` together with the
         'title_weight', 'header_weights', 'use_pdf', 'use_stemmer',
         'ukkonen_len' and 'homepage_weight' entries of `self.text_params`;
         the returned pieces are joined with single spaces.  The result is
         a list of documents in the same order as `htmls`.
     """
     return [
         ' '.join(extract_texts(
             page,
             title_weight=self.text_params['title_weight'],
             header_weights=self.text_params['header_weights'],
             use_pdf=self.text_params['use_pdf'],
             use_stemmer=self.text_params['use_stemmer'],
             ukkonen_len=self.text_params['ukkonen_len'],
             homepage_weight=self.text_params['homepage_weight']))
         for page in htmls
     ]