def memoize(links, domain):
    maintain_directory()
    # Enter the memoized-articles directory, creating it on the first run
    if not os.path.exists(os.path.join(os.getcwd(), MEMO_DIRECTORY)):
        os.mkdir(MEMO_DIRECTORY)
    os.chdir(MEMO_DIRECTORY)
    memo = {link.href: link for link in links}
    # If we are on our 2nd run or above, we begin to memoize
    if os.path.exists(domain_to_key(domain)):
        # r+ means open for reading and writing; the file is not truncated
        file_obj = open(domain_to_key(domain), "r+")
        # Python handles platform differences automatically: "\n" covers "\r\n" too.
        # Do not refactor this into file_obj.read().split("\n") inline; for
        # whatever reason that yields an empty string (some odd bug?).
        saved_links = file_obj.read()
        # Chop off the last element, it's just an empty string
        saved_links = saved_links.split("\n")[:-1]
        file_obj.close()
        for link in saved_links:
            if memo.get(link):
                # If the link lasts this long on a page, it's not news
                del memo[link]
    text = ""
    for link in memo.keys():
        text += link + "\n"
    # Overwrite the txt file with a new list of links, for next time
    write_unicode_to_file(domain_to_key(domain), safe_unicode(text), "w")
    # Construct the new list of links (objects) that survived the filter
    survived_links = memo.values()
    return survived_links
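
# Note on the on-disk memo (illustrative, inferred from the code above):
# domain_to_key(domain) names a plain-text file holding one href per line,
# newline-terminated, e.g.
#
#   http://example.com/story-one
#   http://example.com/story-two
#
# On the next run, any href from that file which shows up again is treated as
# stale (it has lasted too long to be news) and is dropped from the batch.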
def extract_data(link, get_objects):
    maintain_directory()
    try:
        html = urllib2.urlopen(link.href).read()
    except Exception:
        return  # Bust link; skip it and keep going
    obj = MaxSubSequence(html)
    txt, title = obj.MaxSubSequence(), ""
    if txt is None:
        return  # Beautiful Soup unicode error we need to account for
    title = obj.getTitle()
    if link.title is None:
        # If the title was not captured by the seeker, try once more here
        link.title = title
    # Append the link, title, and body to the saved-articles file, all in Unicode
    total = PROPERTY_DELIMITER.join([link.href, safe_unicode(title), safe_unicode(txt)]) + ARTICLE_DELIMITER
    article_file = codecs.open(SAVED_DIRECTORY, "a+", "utf-8")
    article_file.write(total)
    article_file.close()
    if get_objects:
        link.text = txt
        return link
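
# Usage sketch (not part of the original module): one way the two functions
# above might be combined for a single domain once the seeker has produced
# link objects. The function name crawl_domain and its signature are
# assumptions for illustration only.
def crawl_domain(links, domain, get_objects=True):
    # Drop links that were already seen on the previous run for this domain
    fresh_links = memoize(links, domain)
    articles = []
    for link in fresh_links:
        # extract_data returns None for dead links or unparseable pages
        result = extract_data(link, get_objects)
        if result is not None:
            articles.append(result)
    return articles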