Beispiel #1
0
from collections import namedtuple
import json
import re

from bs4 import BeautifulSoup as BS

from common import (
    datapath,
    log,
    mkdata,
    mkvariant,
    sanitise_ws,
)

# Location of the cached YouTube HTML, as resolved by ``datapath``.
cachepath = datapath('YouTube.html')

# URL templates. ``{y.country}`` / ``{y.lang}`` are format fields; when run
# through ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = (
    'https://www.youtube.com/results'
    '?gl={y.country}'
    '&persist_gl=1'
    '&search_query={{query}}'
)
SUGGEST_URL = (
    'https://suggestqueries.google.com/complete/search'
    '?client=firefox'
    '&ds=yt'
    '&hl={y.lang}'
    '&q={{query}}'
)

# superset of Lang
YT = namedtuple('YT', 'name lang country')


def html():
    """Encoded HTML data from URL or cache (if it exists).

    Returns:
        str: Raw bytes returned from URL/file
Beispiel #2
0
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2017-02-05
#

"""Generate eBay engine JSON."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import csv
import json

from common import datapath, mkdata, mkvariant
# Tab-separated data file opened by ``variants()``.
path = datapath('ebay-variants.tsv')

# URL templates. ``{tld}`` / ``{site}`` are format fields; when run through
# ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = (
    'https://www.ebay.{tld}/sch/i.html'
    '?_nkw={{query}}'
)
SUGGEST_URL = (
    'https://autosug.ebay.com/autosug'
    '?fmt=osr'
    '&sId={site}'
    '&kwd={{query}}'
)

# Record type named in the ``variants()`` docstring as its yielded value.
Variant = namedtuple('Variant', 'site uid tld name')


def variants():
    """International eBay variants.

    Yields:
        Variant: eBay variant
    """
    with open(path) as fp:
        for line in csv.reader(fp, delimiter='\t'):
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2017-02-06
#
"""Generate engine JSON for a Google search."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import csv
import json

from common import datapath, mkdata, mkvariant

# Tab-separated data file opened by ``langs()``.
path = datapath('google-languages.tsv')

# Record built by ``langs()`` from the fields of one row of that file.
Lang = namedtuple('Lang', 'id name')


def langs():
    """Iterate over the languages listed in the TSV file at ``path``.

    The file is read as tab-delimited CSV. Every field of a row is decoded
    from UTF-8 and the decoded fields are passed positionally to ``Lang``.

    Yields:
        Lang: One record per row of the file.
    """
    with open(path) as tsv:
        for row in csv.reader(tsv, delimiter='\t'):
            fields = [cell.decode('utf-8') for cell in row]
            yield Lang(*fields)

# Created on 2016-12-17
#

"""Generate Wikipedia engine JSON."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import json

from bs4 import BeautifulSoup as BS

from common import datapath, httpget, mkdata, mkvariant

# Page listing all Wikipedias, and the ``datapath`` location for its HTML.
url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
cachepath = datapath('Wikipedia.html')

# Wikis with an article count below this are ignored.
MIN_ARTICLE_COUNT = 10000

# URL templates. ``{l.code}`` is a format field; when run through
# ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = 'https://{l.code}.wikipedia.org/wiki/{{query}}'
SUGGEST_URL = (
    'https://{l.code}.wikipedia.org/w/api.php'
    '?action=opensearch'
    '&search={{query}}'
)

# superset of Lang
Wiki = namedtuple('Wiki', 'name code size')


def html():
    """Encoded HTML data from URL or cache (if it exists).
Beispiel #5
0
# Copyright (c) 2016 Dean Jackson <*****@*****.**>
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2016-03-13
#
"""Output TSV list of ISO-639-1 language ``code,name``."""

from __future__ import print_function, absolute_import

from common import datapath, httpget, Lang, print_lang

from bs4 import BeautifulSoup as BS

# Wikipedia page listing ISO 639-1 codes; passed to ``httpget`` in ``main()``.
url = 'https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes'
# NOTE(review): the file name refers to country codes ("ISO 3316-1") while
# ``url`` is the ISO 639-1 language-code list -- confirm the name is intended.
cachepath = datapath('ISO 3316-1 Country Codes.html')


def main():
    """Scan the table rows of the page at ``url`` for names and 2-letter codes.

    Obtains the page via ``httpget(url, cachepath)``, parses it with
    BeautifulSoup's ``html.parser`` and walks every ``<tr>``. Rows that do
    not contain exactly 10 ``<td>`` cells are skipped, as are rows whose
    code cell is not exactly 2 characters long after stripping whitespace.

    NOTE(review): in the code visible here ``name`` and ``abbr`` are not
    used after the length check -- the loop body looks cut short; confirm
    against the full script.
    """
    html = httpget(url, cachepath)
    soup = BS(html, 'html.parser')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # Only rows with exactly 10 cells are processed.
        if len(cells) != 10:
            continue
        # Cells at indices 3 and 4 are read as (name, code).
        cells = cells[3:5]
        # print(cells)
        name, abbr = [e.get_text().strip() for e in cells]
        # Skip entries whose code is not exactly two characters long.
        if len(abbr) != 2:
            continue
# Created on 2016-03-13
#

"""Wiktionary variants."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import json

from bs4 import BeautifulSoup as BS

from common import datapath, httpget, mkdata, mkvariant

# Wiktionary portal page and the ``datapath`` location paired with it
# in the ``httpget`` call made by ``html()``.
url = 'https://www.wiktionary.org'
path = datapath('Wiktionary.html')

# URL templates. ``{w.lang}`` is a format field; when run through
# ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = (
    'https://{w.lang}.wiktionary.org/wiki/'
    '{{query}}'
)
SUGGEST_URL = (
    'https://{w.lang}.wiktionary.org/w/api.php'
    '?action=opensearch'
    '&search={{query}}'
)

# Record describing one Wiktionary variant.
Wiki = namedtuple('Wiki', 'id url lang name')


def html():
    """Fetch the Wiktionary page through ``httpget``.

    Delegates to ``httpget`` with the module-level ``url`` and ``path``.

    Returns:
        str: Whatever ``httpget(url, path)`` returns (the HTML at ``url``).
    """
    page = httpget(url, path)
    return page
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2016-03-12
#

"""Generate TSV of the languages supported by Google."""

from __future__ import print_function, absolute_import

from HTMLParser import HTMLParser

from common import datapath, httpget, Lang, print_lang

# Google's preferences page, and the second argument given to ``httpget``
# for it (presumably a local cache file -- confirm in ``common.httpget``).
url = 'https://www.google.com/preferences'
cachepath = datapath('Google Prefs.html')


def html():
    """Return the Google preferences page as decoded text.

    Returns:
        The result of ``httpget(url, cachepath)`` decoded as ISO-8859-1.
    """
    raw = httpget(url, cachepath)
    return raw.decode('ISO-8859-1')


def parse_page(html):
    """Parse language id-name pairs from HTML.

    Args:
        html (unicode): Google's preferences page.

    Returns:
        list: Sequence of 2-tuples: `(id, name)`.
    """
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2016-03-13
#

"""Output TSV list of ISO-639-1 language ``code,name``."""

from __future__ import print_function, absolute_import

from common import datapath, httpget, Lang, print_lang

from bs4 import BeautifulSoup as BS


# Wikipedia page listing ISO 639-1 codes; passed to ``httpget`` in ``main()``.
url = 'https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes'
# NOTE(review): the file name refers to country codes ("ISO 3316-1") while
# ``url`` is the ISO 639-1 language-code list -- confirm the name is intended.
cachepath = datapath('ISO 3316-1 Country Codes.html')


def main():
    """Scan the table rows of the page at ``url`` for names and 2-letter codes.

    Obtains the page via ``httpget(url, cachepath)``, parses it with
    BeautifulSoup's ``html.parser`` and walks every ``<tr>``. Rows that do
    not contain exactly 10 ``<td>`` cells are skipped, as are rows whose
    code cell is not exactly 2 characters long after stripping whitespace.

    NOTE(review): in the code visible here ``name`` and ``abbr`` are not
    used after the length check -- the loop body looks cut short; confirm
    against the full script.
    """
    html = httpget(url, cachepath)
    soup = BS(html, 'html.parser')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # Only rows with exactly 10 cells are processed.
        if len(cells) != 10:
            continue
        # Cells at indices 3 and 4 are read as (name, code).
        cells = cells[3:5]
        # print(cells)
        name, abbr = [e.get_text().strip() for e in cells]
        # Skip entries whose code is not exactly two characters long.
        if len(abbr) != 2:
            continue
Beispiel #9
0
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2017-02-06
#

"""Generate engine JSON for a Google search."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import csv
import json

from common import datapath, mkdata, mkvariant

# Tab-separated data file opened by ``langs()``.
path = datapath('google-languages.tsv')

# Record built by ``langs()`` from the fields of one row of that file.
Lang = namedtuple('Lang', 'id name')


def langs():
    """Generate a ``Lang`` for each row of the TSV file at ``path``.

    Rows are parsed as tab-delimited CSV; each field is decoded from UTF-8
    before the row is unpacked into ``Lang``.

    Yields:
        Lang: One record per row of the file.
    """
    with open(path) as handle:
        reader = csv.reader(handle, delimiter='\t')
        for record in reader:
            decoded = [field.decode('utf-8') for field in record]
            yield Lang(*decoded)

#
# Created on 2016-03-13
#
"""Wiktionary variants."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import json

from bs4 import BeautifulSoup as BS

from common import datapath, httpget, mkdata, mkvariant

# Wiktionary portal page and the ``datapath`` location paired with it
# in the ``httpget`` call made by ``html()``.
url = 'https://www.wiktionary.org'
path = datapath('Wiktionary.html')

# URL templates. ``{w.lang}`` is a format field; when run through
# ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = (
    'https://{w.lang}.wiktionary.org/wiki/'
    '{{query}}'
)
SUGGEST_URL = (
    'https://{w.lang}.wiktionary.org/w/api.php'
    '?action=opensearch'
    '&search={{query}}'
)

# Record describing one Wiktionary variant.
Wiki = namedtuple('Wiki', 'id url lang name')


def html():
    """Fetch the Wiktionary page through ``httpget``.

    Delegates to ``httpget`` with the module-level ``url`` and ``path``.

    Returns:
        str: Whatever ``httpget(url, path)`` returns (the HTML at ``url``).
    """
    content = httpget(url, path)
    return content
# Copyright (c) 2016 Dean Jackson <*****@*****.**>
#
# MIT Licence. See http://opensource.org/licenses/MIT
#
# Created on 2017-02-05
#
"""Generate Duck Duck Go engine JSON."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import csv
import json

from common import datapath, mkdata, mkvariant
# Tab-separated data file opened by ``variants()``.
path = datapath('ddg-variants.tsv')

# URL templates. ``{kl}`` is a format field; when run through
# ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = (
    'https://duckduckgo.com/'
    '?kp=-1'
    '&kz=-1'
    '&kl={kl}'
    '&q={{query}}'
)
SUGGEST_URL = (
    'https://duckduckgo.com/ac/'
    '?kp=-1'
    '&kz=-1'
    '&kl={kl}'
    '&q={{query}}'
)

# Record type named in the ``variants()`` docstring as its yielded value.
Variant = namedtuple('Variant', 'id name')


def variants():
    """DDG variants from `ddg-variants.tsv`.

    Yields:
        Variant: DDG variant
    """
    with open(path) as fp:
        for line in csv.reader(fp, delimiter='\t'):
#
# Created on 2016-12-17
#
"""Generate Wikipedia engine JSON."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import json

from bs4 import BeautifulSoup as BS

from common import datapath, httpget, mkdata, mkvariant

# Page listing all Wikipedias, and the ``datapath`` location for its HTML.
url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
cachepath = datapath('Wikipedia.html')

# Wikis with an article count below this are ignored.
MIN_ARTICLE_COUNT = 10000

# URL templates. ``{l.code}`` is a format field; when run through
# ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = 'https://{l.code}.wikipedia.org/wiki/{{query}}'
SUGGEST_URL = (
    'https://{l.code}.wikipedia.org/w/api.php'
    '?action=opensearch'
    '&search={{query}}'
)

# superset of Lang
Wiki = namedtuple('Wiki', 'name code size')


def html():
    """Encoded HTML data from URL or cache (if it exists).
Beispiel #13
0
"""Generate Wikia engine JSON."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import json
import re
import sys

from bs4 import BeautifulSoup as BS

from common import datapath, httpget, mkdata, mkvariant

# Pairs of (hub page URL, ``datapath`` location for that page's HTML).
SOURCES = [
    (
        'http://community.wikia.com/wiki/Hub:Big_wikis',
        datapath('Wikia-Biggest.html'),
    ),
    (
        'http://community.wikia.com/wiki/Hub:Wikis_with_many_active_members',
        datapath('Wikia-Most-Active.html'),
    ),
    (
        'http://community.wikia.com/wiki/Hub:Sci-Fi',
        datapath('Wikia-SF.html'),
    ),
]

# URL templates. ``{w.subdomain}`` is a format field; when run through
# ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = 'http://{w.subdomain}.wikia.com/wiki/{{query}}'
SUGGEST_URL = (
    'http://{w.subdomain}.wikia.com/api.php'
    '?action=opensearch'
    '&search={{query}}'
)

# Record describing one wiki.
Wiki = namedtuple('Wiki', 'name subdomain')

# Bound ``match`` of a pattern whose group 1 is the text between
# ``http://`` and the first following dot.
match = re.compile(r'http://(.+?)\..+').match


def log(s, *args):
Beispiel #14
0
"""Generate YouTube variants."""

from __future__ import print_function, absolute_import

from collections import namedtuple
import json
import re

from bs4 import BeautifulSoup as BS

from common import (
    datapath, log,
    mkdata, mkvariant, sanitise_ws,
)

# Location of the cached YouTube HTML, as resolved by ``datapath``.
cachepath = datapath('YouTube.html')

# URL templates. ``{y.country}`` / ``{y.lang}`` are format fields; when run
# through ``str.format`` the doubled braces leave a literal ``{query}``.
SEARCH_URL = (
    'https://www.youtube.com/results'
    '?gl={y.country}'
    '&persist_gl=1'
    '&search_query={{query}}'
)
SUGGEST_URL = (
    'https://suggestqueries.google.com/complete/search'
    '?client=firefox'
    '&ds=yt'
    '&hl={y.lang}'
    '&q={{query}}'
)

# superset of Lang
YT = namedtuple('YT', 'name lang country')


def html():
    """Encoded HTML data from URL or cache (if it exists).

    Returns:
        str: Raw bytes returned from URL/file