def fetch_and_store(url):
    """Download and clean the article at *url*, persist it as a SearchDocument.

    Raises Http404 if the article cannot be fetched/cleaned for any reason.
    Returns the saved SearchDocument instance.
    """
    try:
        article_title, article_text = fetch_and_clean(url)
    except Exception:
        # Any fetch/clean failure is surfaced to the caller as a 404.
        raise Http404(url)

    document = SearchDocument()
    document.url = url
    document.title = article_title
    document.text = article_text
    document.save()
    return document
# NOTE(review): this function is defined twice in this module; the later
# definition shadows this one. The two copies should be deduplicated.
def search_against_url(request, url):
    """
    Accepts a URL as either a suffix of the URI or a POST request
    parameter. Downloads the content, feeds it through the
    readability article grabber, then submits the article text to
    superfastmatch for comparison.
    """
    scheme = urlparse(url)[0]
    if scheme not in ('http', 'https'):
        return search_page(
            request,
            error='The URL must begin with either http or https.')

    sfm = from_django_conf('sidebyside')
    try:
        (title, text) = fetch_and_clean(url)
    except requests.exceptions.Timeout:
        return search_page(
            request,
            error="Sorry, that news article couldn't be retrieved.")

    try:
        sfm_results = sfm.search(text=text, title=title, url=url)
        drop_silly_results(sfm_results)
        sort_by_coverage(sfm_results)

        # If they submit a url, don't return the exact same url in the
        # results. BUG fix: the original removed items from the list while
        # iterating it, which skips elements; rebuild the list instead.
        rows = sfm_results['documents']['rows']
        sfm_results['documents']['rows'] = [
            r for r in rows if r.get('url') != url]

        # dict.get replaces the deprecated has_key()/else branches.
        text = sfm_results.get('text', '')
        title = sfm_results.get('title', 'No Title')

        return search_result_page(request, sfm_results, text,
                                  source_title=title, source_url=url)
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            # BUG fix: the original did `raise HttpResponse(...)`, which
            # raises a non-exception (TypeError at runtime). Return a
            # proper 404 response instead.
            return HttpResponse('No such article {0}'.format(url),
                                status=httplib.NOT_FOUND)
        elif settings.DEBUG:
            return HttpResponse(e.response[1], status=e.response[0])
        else:
            raise
# NOTE(review): this is the second definition of search_against_url in this
# module; it shadows the earlier one. The two copies should be deduplicated.
def search_against_url(request, url):
    """
    Accepts a URL as either a suffix of the URI or a POST request
    parameter. Downloads the content, feeds it through the
    readability article grabber, then submits the article text to
    superfastmatch for comparison.
    """
    scheme = urlparse(url)[0]
    if scheme not in ('http', 'https'):
        return search_page(
            request,
            error='The URL must begin with either http or https.')

    sfm = from_django_conf('sidebyside')
    try:
        (title, text) = fetch_and_clean(url)
    except requests.exceptions.Timeout:
        return search_page(
            request,
            error="Sorry, that news article couldn't be retrieved.")

    try:
        sfm_results = sfm.search(text=text, title=title, url=url)
        drop_silly_results(sfm_results)
        sort_by_coverage(sfm_results)

        # If they submit a url, don't return the exact same url in the
        # results. BUG fix: the original removed items from the list while
        # iterating it, which skips elements; rebuild the list instead.
        rows = sfm_results['documents']['rows']
        sfm_results['documents']['rows'] = [
            r for r in rows if r.get('url') != url]

        # dict.get replaces the deprecated has_key()/else branches.
        text = sfm_results.get('text', '')
        title = sfm_results.get('title', 'No Title')

        return search_result_page(request, sfm_results, text,
                                  source_title=title, source_url=url)
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            # BUG fix: the original did `raise HttpResponse(...)`, which
            # raises a non-exception (TypeError at runtime). Return a
            # proper 404 response instead.
            return HttpResponse('No such article {0}'.format(url),
                                status=httplib.NOT_FOUND)
        elif settings.DEBUG:
            return HttpResponse(e.response[1], status=e.response[0])
        else:
            raise