Example #1
def generate_text_files():
    """
    Reads from the list generated by parse_readings().
    Exports the text of each reading to standalone text files
    """

    with open('%s/%s' % (data_dir, readings_file), 'r') as jsonfile:
        readings = json.loads(jsonfile.read())

    for reading in readings:
        filename = vfn(reading['url'], initCap=False).decode()

        # Output a directory of text files generated by newspaper
        with open(
                '%s/%s/%s/%s.txt' %
                (data_dir, readings_text_dir, 'newspaper_parse', filename),
                'w') as htmlfile:
            htmlfile.write(reading['page']['n_text'])

        # Output a directory of text files generated by boilerpipe
        with open(
                '%s/%s/%s/%s.txt' %
                (data_dir, readings_text_dir, 'boilerpipe_parse', filename),
                'w') as htmlfile:
            htmlfile.write(reading['page']['b_text'])
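Note that open(..., 'w') does not create intermediate directories, so the newspaper_parse and boilerpipe_parse folders must already exist under data_dir/readings_text_dir. A minimal sketch to create them up front, reusing the module-level names assumed by the example above:

import os

# Assumes the module-level data_dir and readings_text_dir used by
# generate_text_files(); the subdirectory names come from the code above.
for subdir in ('newspaper_parse', 'boilerpipe_parse'):
    os.makedirs('%s/%s/%s' % (data_dir, readings_text_dir, subdir),
                exist_ok=True)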
Example #2
def scrape_readings():
    """
    Reads from the list generated by parse_course().
    Grabs a copy of each page.
    Writes it to HTML file in data/readings/html.
    """
    # Use the json list of readings if it exists
    try:
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())
    # otherwise, generate it
    except FileNotFoundError:
        parse_course()
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())

    for reading in readings:
        # Request each reading's URL
        r = requests.get(reading['url'])

        # Use goldfinch to make a valid filename from the URL
        filename = vfn(reading['url'], initCap=False).decode()

        # Make a local copy of the page
        with open('%s/%s/%s' % (data_dir, readings_html_dir, filename),
                  'w') as htmlfile:
            htmlfile.write(r.text)

        # Print something to stdout.
        print('Saved page %s: %s' % (reading['id'], reading['url']))
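Note that the requests.get() call above has no timeout or error handling, so a hanging or failing URL stalls the loop or silently saves an error page. A small hypothetical helper that a variant of scrape_readings() could call instead of the bare requests.get() (the timeout value is an assumption, not part of the original):

import requests

def fetch_reading(url, timeout=30):
    """Hypothetical helper: fetch a page, failing loudly on HTTP errors."""
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text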
Example #3
def creator(url):
    """
    Creates the full path to a TimeMap based on dump_dir, url and args configuration
    :param str url: URL to be filenamified
    :return: Path to TimeMap
    """
    sanity = vfn(url.replace('://', '_').replace('/', '_'),
                 initCap=False).decode("utf-8")
    return os.path.join(dump_dir, sanity + the_ext)
def sane_filename(url):
    """
    Turns a url into a sane filename. Replaces :// and / with _
    then feeds url to goldfinch.validFileName
    :param str url: The url to turn into a valid filename
    :return str: A valid filename from the supplied url
    """
    return vfn(url.replace('://', '_').replace('/', '_'),
               initCap=False).decode("utf-8")
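A minimal usage sketch for sane_filename(), assuming goldfinch's validFileName is imported as vfn at module level as these examples imply; the URL is hypothetical and the exact output depends on goldfinch's rules:

# Hypothetical URL; printed rather than asserted, because the exact result
# depends on goldfinch's character rules.
print(sane_filename('https://example.com/articles/42'))
# expected shape: 'https_example.com_articles_42'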
Example #5
    def getImage(self, playlist, image_url):
        """Gets an image from the given url, resizes it if necessary to fit our
        dimensions, saves the resized image to
        static/images/songs/playlist_name.fileformat, and returns the path it
        was saved to."""
        image_max = 600
        r = requests.get(image_url)
        i = Image.open(BytesIO(r.content))
        orig_width = i.size[0]
        orig_height = i.size[1]
        if orig_width > image_max or orig_height > image_max:
            print("resize")
            i.thumbnail((image_max, image_max), Image.ANTIALIAS)
        filename = os.path.join(
            images_location,
            vfn(playlist, initCap=False, ascii=False).decode("UTF-8"))
        i.save(filename, i.format)
        return filename
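One portability note on the example above: Image.ANTIALIAS was removed in Pillow 10, where Image.LANCZOS names the same filter. A small sketch that picks whichever constant the installed Pillow provides:

from PIL import Image

# Use ANTIALIAS where it still exists (Pillow < 10); otherwise fall back to
# LANCZOS, which is the same resampling filter under its current name.
RESAMPLE = getattr(Image, 'ANTIALIAS', Image.LANCZOS)
# i.thumbnail((image_max, image_max), RESAMPLE)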
def filenamify(url,
               rcolslash='_',
               space='underscore',
               initCap=True,
               ascii=True):
    """
    Filenamifies a URL using goldfinch
    :param url: The URL to filenamify
    :param rcolslash: Replacement string for "://" and "/"
    :param space: 'underscore', 'remove', or 'keep'
    :param initCap: True or False
    :param ascii: True or False
    :return: The filenamified URL
    """
    return vfn(url.replace('://', rcolslash).replace('/', rcolslash),
               space=space,
               initCap=initCap,
               ascii=ascii).decode("utf-8")
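A minimal usage sketch for filenamify(); the URLs are hypothetical and the outputs are not shown, since the exact result depends on goldfinch:

# Hypothetical URLs; printed rather than asserted, since the exact output
# depends on goldfinch's rules.
print(filenamify('https://example.com/Some Page'))
print(filenamify('https://example.com/Some Page', initCap=False, space='remove'))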
Example #7
    def get_img_list(self):
        """ Gets list of images from the page_html. """
        tree = html.fromstring(self.page_html)
        img = tree.xpath('//img/@src')
        links = tree.xpath('//a/@href')
        img_list = self.process_links(img)
        img_links = self.process_links(links)
        img_list.extend(img_links)

        if self.useTitle:
            title = tree.xpath('//title')
            if title:
                # vfn() returns bytes; decode to a str, dropping non-ASCII characters
                title = vfn(title[0].text).decode('ascii', 'ignore')
                if title:
                    self.download_path = os.path.join(self.download_path, title)

        if self.filename_pattern:
            # Compile pattern for efficiency
            pattern = re.compile(self.filename_pattern)

            # Verifies filename in the image URL matches pattern
            def matches_pattern(img_url):
                """ Function to check if pattern is matched. """

                img_filename = urlparse(img_url).path.split('/')[-1]
                return pattern.search(img_filename)

            images = [urljoin(self.url, img_url) for img_url in img_list
                      if matches_pattern(img_url)]
        else:
            images = [urljoin(self.url, img_url) for img_url in img_list]

        images = list(set(images))
        self.images = images
        if self.scrape_reverse:
            self.images.reverse()
        return self.images
Example #8
def normalizePath(input):
    return vfn(input, space="keep", initCap=False).decode('utf-8').rstrip(".")
def npath(path):
    return vfn(path, space='keep', initCap=False).decode('utf-8').rstrip('.')
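A minimal usage sketch for the two helpers above; the input is hypothetical and goldfinch decides how other characters are handled:

# Hypothetical name; exact handling of other characters is up to goldfinch.
print(npath('My Report v2..'))          # trailing dots stripped, spaces kept
print(normalizePath('My Report v2..'))  # equivalent helper, same result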
Example #10
def parse_readings():
    """
    Reads from the list generated by parse_course().
    Reads each readings page from scrape_readings().
    Parses the HTML.
    Writes to JSON.
    """

    # Use the json list of readings if it exists
    try:
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())
    # otherwise, generate it
    except FileNotFoundError:
        parse_course()
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())

    # Create lists to hold readings
    reading_list = []
    pdf_list = []
    error_list = []

    for reading in readings:
        # Skip pdf files
        if '.pdf' in reading['url']:
            pdf_list.append(reading)

        else:
            # Container for parsed data
            reading_item = {}

            # Use goldfinch to make a valid filename from the URL
            filename = vfn(reading['url'], initCap=False).decode()

            # Initialize a newspaper article
            # url is empty because we don't need newspaper to do any scraping,
            # but it's a required argument
            article = Article(url='')

            # Open the local version of the HTML file
            try:
                with open('%s/%s/%s' % (data_dir, readings_html_dir, filename),
                          'r') as htmlfile:
                    # Save both the raw html and add it to the article
                    raw_html = htmlfile.read()
                    article.set_html(raw_html)
            except FileNotFoundError:
                print('Error reading saved html file for %s' % reading['url'])
                # Nothing to parse without the saved HTML, so skip this reading
                continue

            # Use newspaper to do the parsing
            article.parse()

            reading_item['title'] = article.title
            reading_item['authors'] = article.authors

            # Set iso string version of date if it exists.
            # It needs to be a string because we'll be exporting to JSON
            reading_item['pub_date'] = article.publish_date.isoformat() \
                if article.publish_date else None

            # Usually newspaper's extractor works best
            reading_item['n_text'] = article.text

            # But when it fails, we may want to use boilerpipe extraction as
            # a fallback
            extractor = Extractor(extractor='ArticleExtractor', html=raw_html)
            reading_item['b_text'] = extractor.getText()

            # print('Newspaper words: %s' % len(reading_item['n_text'].split()))
            # print('Boilerpipe words: %s' % len(reading_item['b_text'].split()))

            # if(reading_item['text'] == ''):
            #     extractor = Extractor(extractor='ArticleExtractor', html=raw_html)
            #     reading_item['text'] = extractor.getText()

            # Add the parsed data to our existing reading data
            reading['page'] = reading_item

            # Note failed parses
            if (reading_item['n_text'] == '' and reading_item['b_text'] == ''):
                print('Could not parse text for %s' % reading['url'])
                error_list.append(reading)
            else:
                reading_list.append(reading)

    print('Successfully parsed readings: %s' % len(reading_list))

    print('Skipped PDF readings: %s' % len(pdf_list))

    print('Articles without parseable text: %s' % len(error_list))

    # print('Articles without authors: %s' % len([
    #     reading for reading in reading_list
    #     if reading['page']['authors'] == []]))

    # print('Articles without dates: %s' % len([
    #     reading for reading in reading_list
    #     if reading['page']['pub_date'] is None]))

    # Write to json file
    with open('%s/%s' % (data_dir, readings_file), 'w') as jsonfile:
        jsonfile.write(json.dumps(reading_list))
def filename_from_url(url):
    ''' Converts a url into a friendlier file name'''

    # return 'cache/' + re.sub(r'://|/', '_', url)
    # Hash the name (md5) to avoid very long URLs causing filesystem failures
    # when the filename exceeds the length limit (255 chars is not uncommon)
    return 'cache/' + hashlib.md5(vfn(url)).hexdigest()
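A minimal usage sketch for the hashing variant; the URL is hypothetical and the cache/ directory is assumed to exist. vfn() returns bytes, which is what hashlib.md5() expects:

# Hypothetical URL; the digest keeps every cache filename at 32 hex characters,
# however long the original URL is.
path = filename_from_url('https://example.com/a/very/long/article/path?with=query')
print(path)  # 'cache/' followed by a 32-character hex digest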
Example #12
def filename_from_url(url):
    ''' Converts a url into a friendlier file name'''

    # return 'cache/' + re.sub(r'://|/', '_', url)
    # vfn() returns bytes; decode so it can be joined with the str prefix
    return 'cache/' + vfn(url).decode()
Example #13
def makeFileFriendly(fileName):
    return vfn(string.capwords(fileName), space="keep").decode('utf-8')
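A minimal usage sketch for makeFileFriendly(), with the import these examples imply (treated as an assumption here); the input is hypothetical and the exact output depends on goldfinch:

import string
from goldfinch import validFileName as vfn  # import assumed by these examples

print(makeFileFriendly('my mix tape vol. 3'))
# expected shape: 'My Mix Tape Vol. 3' (capwords capitalizes; spaces are kept)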