def _read_stories(input_file_path):
    """Return a list of stories with full contents and a dict with new IDs.

    Generate list elements in tuple form, where each element corresponds to a
    single line of the given processed stories log file.  Do not trim the
    newline off the end of the last element of the tuple.  Append a space
    followed by the full story contents to the title of each story.  Omit
    stories whose full contents could not be fetched or whose fetched
    contents are shorter than MIN_STORY_LENGTH characters.  Generate
    dict entries mapping from story IDs in the input file to story IDs in the
    output file.  Maintain the ordering of the input file in the list.  This
    ordering is equivalent to ascending order of both old story IDs and new
    story IDs.  Be warned that fetching full story contents is quite slow and
    consumes a great deal of bandwidth, so you or other users on your network
    may experience connectivity problems while executing this function.

    input_file_path, a str, is the file path to the processed Pulse stories log
    file that contains story URLs and titles but not the full contents of the
    stories themselves.
    """
    start_time = time.time()
    old_story_id = 0
    new_story_id = 0
    stories_list = []
    story_id_dict = {}
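    # Cache fetch results by URL so a URL that appears on multiple lines is
    # fetched at most once; failed fetches are cached as None.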
    story_contents_dict = {}
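    # Bound every fetch with a global socket timeout so a single
    # unresponsive server cannot stall the whole pass over the file.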
    socket.setdefaulttimeout(TIMEOUT_LENGTH)
    input_stream = open_safely(input_file_path)
    
    for story_as_str in input_stream:
        story_as_list = story_as_str.split(DELIMITER)
        story_url = story_as_list[NEW_STORIES_URL_INDEX]
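        # Reuse the cached result if this URL appeared on an earlier line.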
        if story_url in story_contents_dict:
            story_contents = story_contents_dict[story_url]
        else:
            story_contents = html2text.extractFromURL(story_url)
            # Treat fetched contents shorter than the minimum length as a
            # failed fetch, matching the >= MIN_STORY_LENGTH check in
            # _insert_full_story below.
            if (story_contents is not None) and \
                    (len(story_contents) < MIN_STORY_LENGTH):
                story_contents = None
            story_contents_dict[story_url] = story_contents
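        # Keep the story only if usable full contents were fetched.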
        if story_contents is not None:
            story_as_list[NEW_STORIES_TITLE_INDEX] += " " + story_contents
            stories_list.append(tuple(story_as_list))
            story_id_dict[old_story_id] = new_story_id
            new_story_id += 1
        old_story_id += 1
        
    input_stream.close()
    num_stories_discarded = old_story_id - new_story_id
    # Avoid a ZeroDivisionError when the input file contains no stories.
    discard_rate = (100.0 * num_stories_discarded / old_story_id
                    if old_story_id > 0 else 0.0)
    print(("Read a total of %d %s, %d (%.2f%%) of which were discarded " + \
           "because their full contents could not be fetched.") % \
           (old_story_id, STORIES_DESCRIPTOR, num_stories_discarded,
            discard_rate))
    report_time_elapsed(start_time)
    return (stories_list, story_id_dict)
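
# A minimal usage sketch (the file name below is hypothetical; since
# _read_stories is private to this module, it would normally be invoked by a
# public wrapper defined here):
#
#     stories_list, story_id_dict = _read_stories("processed_stories.log")
#     # stories_list[i] is a tuple of log fields with the full contents
#     # appended to the title; story_id_dict maps old story IDs to new ones.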


def _insert_full_story(story, time_first_read, stories_dict,
                       story_contents_dict):
    """Record a story in stories_dict, fetching its full contents if needed.

    story, a tuple, contains the fields from a single line of the old
    stories log file.  time_first_read is the time at which the story was
    first read.  stories_dict maps (feed URL, feed title, story URL, story
    title) keys to [earliest read time, full story contents] values.
    story_contents_dict caches fetch results by story URL, storing None for
    URLs whose contents could not be fetched or were too short.
    """
    story_url = story[OLD_STORIES_URL_INDEX]
    key = (story[OLD_STORIES_FEED_URL_INDEX],
           story[OLD_STORIES_FEED_TITLE_INDEX], story_url,
           story[OLD_STORIES_TITLE_INDEX])
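    # The story was already recorded: keep the earliest time it was read.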
    if key in stories_dict:
        value = stories_dict[key]
        value[0] = min(value[0], time_first_read)
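    # The URL's fetch result is already cached, possibly from a story with a
    # different title or feed: reuse it rather than fetching again.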
    elif story_url in story_contents_dict:
        story_contents = story_contents_dict[story_url]
        if story_contents is not None:
            stories_dict[key] = [time_first_read, story_contents]
    else:
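        # First time this URL is seen: note the story key on stderr, then
        # fetch and cache its full contents.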
        pprint.pprint(key, sys.stderr)
        story_contents = html2text.extractFromURL(story_url)
        if (story_contents is not None) and \
                (len(story_contents) >= MIN_STORY_LENGTH):
            stories_dict[key] = [time_first_read, story_contents]
            story_contents_dict[story_url] = story_contents
        else:
            story_contents_dict[story_url] = None
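
# A minimal sketch of how _insert_full_story might be driven (the loop,
# old_log_path, and OLD_STORIES_TIME_INDEX below are assumptions for
# illustration, not code from this module):
#
#     stories_dict = {}
#     story_contents_dict = {}
#     for line in open_safely(old_log_path):
#         story = tuple(line.split(DELIMITER))
#         _insert_full_story(story, float(story[OLD_STORIES_TIME_INDEX]),
#                            stories_dict, story_contents_dict)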