Example #1
0
def get_url_filename(url=None, driveid=None):
    """Return the document name behind a Google Drive URL.

    Args:
        url: Full URL of the document. When falsy, a Drive "open" URL is
            built from ``driveid`` instead.
        driveid: Google Drive file id, only used when ``url`` is falsy.

    Returns:
        The title reported by ``get_url_title`` with a trailing
        "Google Drive" suffix (and the ``-``/``:`` separator before it)
        removed. ``None`` when there is nothing to look up, when the URL is
        not a ``https://drive.google.com`` URL, or when no title was found.
    """
    if not url and not driveid:
        # Nothing to look up; avoid requesting the bogus ".../open?id=None".
        return None
    url = url or 'https://drive.google.com/open?id={}'.format(driveid)
    if not url.startswith('https://drive.google.com'):
        return None
    filename = get_url_title(url)
    if not filename:
        # The title lookup can come back empty (other callers guard for it
        # too), so pass that through rather than calling str methods on it.
        return filename
    suffix = 'Google Drive'
    if filename.endswith(suffix):
        # Drop the suffix, then the "-" / ":" separator and any whitespace
        # that preceded it.
        filename = filename[:-len(suffix)].rstrip().rstrip('-:').rstrip()
    return filename
Example #2
0
def infer_url_title(url):
    """ Guess what the page title is going to be from the path and FQDN in the URL

    The URL is first described by ``get_url_filemeta``. For Google Drive
    hosts the title is looked up with ``get_url_title``; for every other
    host the file name from the metadata (minus its extension) is used,
    falling back to the host name, and finally to the URL itself when the
    metadata carries no host name.

    Returns:
        The guessed title passed through ``delimit_slug(title, ' ')``, or
        ``None`` (after logging an error) when no metadata could be obtained.

    >>> infer_url_title('https://ai.googleblog.com/2018/09/the-what-if-tool-code-free-probing-of.html')
    'the what if tool code free probing of'
    """
    meta = get_url_filemeta(url)
    if not meta:
        logging.error('Unable to retrieve URL: {}'.format(url))
        return None

    # Look the host name up once and tolerate its absence everywhere, instead
    # of mixing ``.get('hostname', url)`` with a bare ``meta['hostname']``.
    hostname = meta.get('hostname') or url
    if hostname == 'drive.google.com':
        # NOTE(review): get_url_title may return a falsy value; confirm that
        # delimit_slug copes with that.
        title = get_url_title(url)
    else:
        # Prefer the file name; only the stem is wanted, not the extension.
        title = splitext(meta.get('filename') or hostname)[0]
    return delimit_slug(title, ' ')
Example #3
0
def translate_line_footnotes(line, tag=None, default_title='<NOT_FOUND>'):
    r""" Rewrite bare-URL footnotes in `line` so that they mention the page title

    Every URL reported by ``get_line_bad_footnotes(line, tag=tag)`` (the first
    element of its result is skipped) is looked up with ``get_url_title`` and,
    failing that, ``infer_url_title``. A title is only accepted when it still
    contains a space after surrounding punctuation is stripped. Titles longer
    than 55 characters are cut back to their leading segment when that
    segment is longer than 3 characters.

    Args:
        line: Text that may contain ``footnote:[<url>]`` markers.
        tag: Passed through unchanged to ``get_line_bad_footnotes``.
        default_title: Title used when none is found. An empty string yields
            a footnote without a title; ``None`` leaves the footnote as it
            was and logs an error.

    Returns:
        `line` with each matching footnote replaced.

    Illustration (needs network access, so it is not an executable doctest)::

        '...footnote:[https://spacy.io/usage/linguistic-features#rule-based-morphology]'

    becomes::

        '...footnote:[See the web page titled "Linguistic Features : spaCy Usage
        Documentation" (https://spacy.io/usage/linguistic-features#rule-based-morphology).]'
    """
    found = get_line_bad_footnotes(line, tag=tag)
    if not found:
        return line

    for url in found[1:]:
        bare_footnote = 'footnote:[{url}]'.format(url=url)

        # TODO: use these to extract name from hyperlinks
        title = get_url_title(url) or infer_url_title(url) or ''
        title = title.strip(' \t\n\r\f-_:|="\'/\\')

        if ' ' in title:
            logging.info('URL: {}'.format(url))
            logging.info('TITLE: {}'.format(title))

            # Leading segment of the title: cut at the first of each separator.
            headline = title
            for separator in ('\n', '|', 'Â', '·'):
                headline = headline.split(separator)[0].strip()
            if len(headline) > 3 and len(title) > 55:
                title = headline

            # Normalise whatever separators remain in the title.
            for old, new in (('Â', ''), ('·', ':'), ('|', ':'), ('\n', '--')):
                title = title.replace(old, new)
            logging.info('FINAL: {}'.format(title))
        else:
            # Single-word (or empty) titles are not considered usable.
            title = None

        title = title or default_title
        if title:
            replacement = 'footnote:[See the web page titled "{title}" ({url}).]'.format(
                title=title, url=url)
        elif title is None:
            logging.error('Unable to find a title for url: {}'.format(url))
            replacement = bare_footnote
        else:
            replacement = 'footnote:[See the web page ({url}).]'.format(url=url)
        line = line.replace(bare_footnote, replacement)

    return line