def generate_text_files():
    """
    Reads from the list generated by parse_readings().
    Exports the text of each reading to standalone text files.
    """
    with open('%s/%s' % (data_dir, readings_file), 'r') as jsonfile:
        readings = json.loads(jsonfile.read())
    for reading in readings:
        # Use goldfinch to make a valid filename from the URL
        filename = vfn(reading['url'], initCap=False).decode()
        # Output a directory of text files generated by newspaper
        with open('%s/%s/%s/%s.txt' % (data_dir, readings_text_dir,
                                       'newspaper_parse', filename),
                  'w') as textfile:
            textfile.write(reading['page']['n_text'])
        # Output a directory of text files generated by boilerpipe
        with open('%s/%s/%s/%s.txt' % (data_dir, readings_text_dir,
                                       'boilerpipe_parse', filename),
                  'w') as textfile:
            textfile.write(reading['page']['b_text'])
def scrape_readings():
    """
    Reads from the list generated by parse_course().
    Grabs a copy of each page.
    Writes it to an HTML file in data/readings/html.
    """
    # Use the json list of readings if it exists
    try:
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())
    # otherwise, generate it
    except FileNotFoundError:
        parse_course()
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())
    for reading in readings:
        # Fetch the page for each reading
        r = requests.get(reading['url'])
        # Use goldfinch to make a valid filename from the URL
        filename = vfn(reading['url'], initCap=False).decode()
        # Make a local copy of the page
        with open('%s/%s/%s' % (data_dir, readings_html_dir, filename),
                  'w') as htmlfile:
            htmlfile.write(r.text)
        # Report progress to stdout
        print('Saved page %s: %s' % (reading['id'], reading['url']))
def creator(url):
    """
    Creates the full path to a TimeMap based on dump_dir, url, and args
    configuration.

    :param str url: URL to be filenamified
    :return: Path to the TimeMap
    """
    sanity = vfn(url.replace('://', '_').replace('/', '_'),
                 initCap=False).decode("utf-8")
    return os.path.join(dump_dir, sanity + the_ext)
def sane_filename(url):
    """
    Turns a URL into a sane filename.
    Replaces :// and / with _, then feeds the URL to goldfinch.validFileName.

    :param str url: The URL to turn into a valid filename
    :return str: A valid filename derived from the supplied URL
    """
    return vfn(url.replace('://', '_').replace('/', '_'),
               initCap=False).decode("utf-8")
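# Illustrative usage of sane_filename. This assumes goldfinch is installed
# and imported as `from goldfinch import validFileName as vfn`, which the
# snippets here rely on but do not show. The exact output depends on the
# goldfinch version; expect a filesystem-safe string with '://' and '/'
# already collapsed to underscores. The URL is made up for the example.
print(sane_filename('https://example.com/path/page.html'))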
def getImage(self, playlist, image_url):
    """
    Gets an image from the given URL, resizes it if necessary to fit our
    dimensions, saves the resized image to
    static/images/songs/playlist_name.fileformat, and returns the path it
    was saved at.
    """
    image_max = 600
    r = requests.get(image_url)
    i = Image.open(BytesIO(r.content))
    orig_width = i.size[0]
    orig_height = i.size[1]
    # Shrink in place if either dimension exceeds the maximum.
    # Note: Image.ANTIALIAS was renamed to Image.LANCZOS and removed in
    # Pillow 10.
    if orig_width > image_max or orig_height > image_max:
        print("resize")
        i.thumbnail((image_max, image_max), Image.ANTIALIAS)
    # Use goldfinch to turn the playlist name into a safe filename
    filename = os.path.join(
        images_location,
        vfn(playlist, initCap=False, ascii=False).decode("UTF-8"))
    i.save(filename, i.format)
    return filename
def filenamify(url, rcolslash='_', space='underscore', initCap=True,
               ascii=True):
    """
    Filenamifies a URL using goldfinch.

    :param url: The URL to filenamify
    :param rcolslash: Replacement string for "://" and "/"
    :param space: 'underscore', 'remove', or 'keep'
    :param initCap: True or False
    :param ascii: True or False
    :return: The filenamified URL
    """
    return vfn(url.replace('://', rcolslash).replace('/', rcolslash),
               space=space, initCap=initCap, ascii=ascii).decode("utf-8")
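# Illustrative calls to filenamify. The keyword arguments pass straight
# through to goldfinch.validFileName, so exact results depend on the
# installed goldfinch version; the URLs below are made up for the example.
print(filenamify('http://example.com/some page'))
# Keep spaces and skip initial capitalization:
print(filenamify('http://example.com/some page', space='keep',
                 initCap=False))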
def get_img_list(self):
    """ Gets the list of images from the page_html. """
    tree = html.fromstring(self.page_html)
    img = tree.xpath('//img/@src')
    links = tree.xpath('//a/@href')
    img_list = self.process_links(img)
    img_links = self.process_links(links)
    img_list.extend(img_links)
    if self.useTitle:
        title = tree.xpath('//title')
        if title:
            # vfn returns bytes: decode first, then drop any remaining
            # non-ASCII characters
            title = vfn(title[0].text).decode('utf-8')
            title = title.encode('ascii', 'ignore').decode('ascii')
        if title:
            self.download_path = os.path.join(self.download_path, title)
    if self.filename_pattern:
        # Compile pattern for efficiency
        pattern = re.compile(self.filename_pattern)

        def matches_pattern(img_url):
            """ Checks whether the filename in the image URL matches
            the pattern. """
            img_filename = urlparse(img_url).path.split('/')[-1]
            return pattern.search(img_filename)

        images = [urljoin(self.url, img_url)
                  for img_url in img_list
                  if matches_pattern(img_url)]
    else:
        images = [urljoin(self.url, img_url) for img_url in img_list]
    # Deduplicate (this discards the original ordering)
    images = list(set(images))
    self.images = images
    if self.scrape_reverse:
        self.images.reverse()
    return self.images
def normalizePath(input):
    # Keep spaces, skip initial capitalization, and strip trailing dots
    # (Windows silently drops trailing dots from filenames)
    return vfn(input, space="keep", initCap=False).decode('utf-8').rstrip(".")
def npath(path):
    return vfn(path, space='keep', initCap=False).decode('utf-8').rstrip('.')
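# normalizePath and npath above are the same idea from two different
# projects: keep spaces, skip initial capitalization, and remove trailing
# dots so names behave consistently across platforms. An illustrative call
# (the filename is made up; exact output depends on the goldfinch version):
print(npath('My Document v2.'))  # trailing dot removed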
def parse_readings():
    """
    Reads from the list generated by parse_course().
    Reads each reading's page saved by scrape_readings().
    Parses the HTML. Writes to JSON.
    """
    # Use the json list of readings if it exists
    try:
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())
    # otherwise, generate it
    except FileNotFoundError:
        parse_course()
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())
    # Create lists to hold readings
    reading_list = []
    pdf_list = []
    error_list = []
    for reading in readings:
        # Skip pdf files
        if '.pdf' in reading['url']:
            pdf_list.append(reading)
        else:
            # Container for parsed data
            reading_item = {}
            # Use goldfinch to make a valid filename from the URL
            filename = vfn(reading['url'], initCap=False).decode()
            # Initialize a newspaper article.
            # url is empty because we don't need newspaper to do any
            # scraping, but it's a required property.
            article = Article(url='')
            # Open the local copy of the HTML file
            try:
                with open('%s/%s/%s' % (data_dir, readings_html_dir,
                                        filename), 'r') as htmlfile:
                    # Save both the raw html and add it to the article
                    raw_html = htmlfile.read()
                    article.set_html(raw_html)
            except FileNotFoundError:
                print('Error reading saved html file for %s' % reading['url'])
                # Nothing to parse without the saved html
                continue
            # Use newspaper to do the parsing
            article.parse()
            reading_item['title'] = article.title
            reading_item['authors'] = article.authors
            # Store an ISO-string version of the date if it exists.
            # It needs to be a string because we'll be exporting to JSON.
            reading_item['pub_date'] = article.publish_date.isoformat() \
                if article.publish_date else None
            # Usually newspaper's extractor works best
            reading_item['n_text'] = article.text
            # But when it fails, we may want to use boilerpipe extraction
            # as a fallback
            extractor = Extractor(extractor='ArticleExtractor', html=raw_html)
            reading_item['b_text'] = extractor.getText()
            # Add the parsed data to our existing reading data
            reading['page'] = reading_item
            # Note failed parses
            if (reading_item['n_text'] == '' and
                    reading_item['b_text'] == ''):
                print('Could not parse text for %s' % reading['url'])
                error_list.append(reading)
            else:
                reading_list.append(reading)
    print('Successfully parsed readings: %s' % len(reading_list))
    print('Skipped PDF readings: %s' % len(pdf_list))
    print('Articles without parseable text: %s' % len(error_list))
    # Write to json file
    with open('%s/%s' % (data_dir, readings_file), 'w') as jsonfile:
        jsonfile.write(json.dumps(reading_list))
def filename_from_url(url):
    ''' Converts a url into a friendlier file name '''
    # return 'cache/' + re.sub(r'://|/', '_', url)
    # Hash the name (md5) so very long URLs can't exceed filesystem
    # filename limits (255 characters is a common maximum)
    return 'cache/' + hashlib.md5(vfn(url)).hexdigest()
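# Illustrative call for the hashing variant of filename_from_url above.
# hashlib.md5 needs bytes, which is why vfn's bytes return value is passed
# in undecoded; the digest is always 32 hex characters, so the cache name
# has a fixed length no matter how long the URL is. The URL is made up.
print(filename_from_url('https://example.com/a/very/deep/path'))
# -> 'cache/<32 hex characters>'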
def filename_from_url(url):
    ''' Converts a url into a friendlier file name '''
    # return 'cache/' + re.sub(r'://|/', '_', url)
    # vfn returns bytes, so decode before joining with the str prefix
    return 'cache/' + vfn(url).decode('utf-8')
def makeFileFriendly(fileName):
    # Capitalize each word, keep spaces, and let goldfinch strip any
    # characters that are invalid in filenames
    return vfn(string.capwords(fileName), space="keep").decode('utf-8')
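# Illustrative call for makeFileFriendly. string.capwords title-cases each
# space-separated word before goldfinch sanitizes the result; the input
# string is made up and the exact output depends on the goldfinch version.
print(makeFileFriendly('my playlist: vol 1'))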