def test_wikipedia_to_single_line_pages():
    """Check the pages produced by wp.wikipedia_to_single_line_pages.

    Four inputs are exercised: one page on one line, two pages on one
    line, a page whose closing tag sits on an indented second line, and
    pages whose opening and closing tags fall on different list items.
    """

    def split_pages(raw_lines, show_lines=False, show_pages=True):
        # Materialise the result first, then echo the requested debug
        # output to stderr (input before output).
        extracted = list(wp.wikipedia_to_single_line_pages(raw_lines))
        if show_lines:
            print >>sys.stderr, "lines:", raw_lines
        if show_pages:
            print >>sys.stderr, "pages:", extracted
        return extracted

    # Text in front of the opening tag is expected to be dropped.
    single = split_pages(["sdfds <page>line</page>"], show_pages=False)
    assert single == ["<page>line</page>"]

    # Two complete pages on the same line.
    double = split_pages(["sdfds <page>line</page> dsfd <page>line2</page>"])
    assert double == ["<page>line</page>", "<page>line2</page>"]

    # The first page is closed on the following (indented) line.
    wrapped_input = """sdfds <page>line
            </page> dsfd <page>line2</page>""".split('\n')
    wrapped = split_pages(wrapped_input, show_lines=True)
    assert wrapped == ["<page>line</page>", "<page>line2</page>"]

    # Pages start and end on different list items.
    scattered_input = ["xsdfs <page>line", "line2</page><page>hi</page><page>", "there</page>"]
    scattered = split_pages(scattered_input, show_lines=True)
    assert scattered == ["<page>lineline2</page>", "<page>hi</page>", "<page>there</page>"]
def test_wikipedia_to_single_line_pages():
    """Table-driven check of wp.wikipedia_to_single_line_pages.

    Each scenario pairs a list of input lines with the list of pages the
    function is expected to produce, plus two flags saying whether the
    input and/or the output are echoed to stderr for debugging.
    """
    multi_line = """sdfds <page>line
            </page> dsfd <page>line2</page>""".split('\n')

    # (input lines, expected pages, echo input?, echo output?)
    scenarios = [
        (
            ["sdfds <page>line</page>"],
            ["<page>line</page>"],
            False,
            False,
        ),
        (
            ["sdfds <page>line</page> dsfd <page>line2</page>"],
            ["<page>line</page>", "<page>line2</page>"],
            False,
            True,
        ),
        (
            multi_line,
            ["<page>line</page>", "<page>line2</page>"],
            True,
            True,
        ),
        (
            ["xsdfs <page>line", "line2</page><page>hi</page><page>", "there</page>"],
            ["<page>lineline2</page>", "<page>hi</page>", "<page>there</page>"],
            True,
            True,
        ),
    ]

    for source_lines, expected, echo_input, echo_output in scenarios:
        produced = list(wp.wikipedia_to_single_line_pages(source_lines))
        if echo_input:
            print >> sys.stderr, "lines:", source_lines
        if echo_output:
            print >> sys.stderr, "pages:", produced
        assert produced == expected
def population_check(filename):
    """Print the 'population_total' value of every page in *filename* to stderr.

    The property text has commas and spaces removed, is passed through
    wp.extract_numeric, and is converted with int() before being printed.
    """
    with open(filename, 'r') as wiki_file:
        for page in wp.wikipedia_to_single_line_pages(wiki_file):
            raw_value = wp.find_property(page, 'population_total')
            # Drop thousands separators and stray spaces before extraction.
            without_commas = raw_value.replace(',', '')
            compact = without_commas.replace(' ', '')
            population = int(wp.extract_numeric(compact))
            print >> sys.stderr, "population:", population
# Example #4
# 0
def main():
    """Build a JSON table of weatherbox templates from a Wikipedia dump.

    The single positional argument is the dump path, or '-' to read the
    dump from stdin.  Every page whose title matches
    ``Template:<something containing weatherbox/Weatherbox>`` is handed to
    wp.parse_weather_box; non-None results are collected in a dict keyed
    by the template name (the part after ``Template:``).  Progress is
    written to stderr and the final dict is printed to stdout as JSON.

    Exits with status 1 (after printing the help text) when no positional
    argument is given.
    """
    usage = """
    python parse_weathrebox_templates wikipedia_dump.xml

    Create and output a table of weatherbox templates
    """
    num_args = 1
    parser = OptionParser(usage=usage)

    # No options are defined yet, so only the positional arguments matter.
    _options, args = parser.parse_args()

    if len(args) < num_args:
        parser.print_help()
        sys.exit(1)

    read_from_stdin = args[0] == '-'
    if read_from_stdin:
        f = sys.stdin
    else:
        f = open(args[0], 'r')

    weatherbox_re = re.compile('Template:(.*[wW]eatherbox.*)')
    boxes = {}
    counter = 0
    wb_counter = 0

    try:
        for page in wp.wikipedia_to_single_line_pages(f):
            counter += 1
            title = wp.parse_title(page)

            match = weatherbox_re.match(title)
            if match is None:
                continue

            wb_counter += 1
            # group(1) is the template name; the former groups(1)[0] only
            # worked because the argument of groups() is a default value.
            template_title = match.group(1)

            weather_box = wp.parse_weather_box(page)
            if weather_box is not None:
                boxes[template_title] = weather_box

            print >> sys.stderr, "Weatherbox counter:", counter, wb_counter, title
    finally:
        # Close the dump we opened ourselves; stdin is left alone.
        if not read_from_stdin:
            f.close()

    print(json.dumps(boxes, indent=2))
def main():
    """Collect weatherbox templates from a Wikipedia dump and print them as JSON.

    Usage: one positional argument naming the dump file, or '-' for stdin.
    Pages are read through wp.wikipedia_to_single_line_pages; for each page
    whose title matches ``Template:(.*[wW]eatherbox.*)`` the result of
    wp.parse_weather_box is stored under the captured template name unless
    it is None.  A progress line per matching page goes to stderr; the
    resulting dict is dumped to stdout with an indent of 2.

    Prints the help text and exits with status 1 when the positional
    argument is missing.
    """
    usage = """
    python parse_weathrebox_templates wikipedia_dump.xml

    Create and output a table of weatherbox templates
    """
    num_args = 1
    parser = OptionParser(usage=usage)

    # Only positional arguments are used; the parsed options are ignored.
    args = parser.parse_args()[1]

    if len(args) < num_args:
        parser.print_help()
        sys.exit(1)

    dump_path = args[0]
    dump = sys.stdin if dump_path == '-' else open(dump_path, 'r')

    weatherbox_re = re.compile('Template:(.*[wW]eatherbox.*)')
    boxes = {}
    counter = 0
    wb_counter = 0

    try:
        for page in wp.wikipedia_to_single_line_pages(dump):
            counter += 1
            title = wp.parse_title(page)

            match = weatherbox_re.match(title)
            if match is not None:
                wb_counter += 1
                # The first capture group holds everything after "Template:".
                template_title = match.group(1)

                weather_box = wp.parse_weather_box(page)
                if weather_box is not None:
                    boxes[template_title] = weather_box

                print >>sys.stderr, "Weatherbox counter:", counter, wb_counter, title
    finally:
        # Release the file handle we opened; never close the caller's stdin.
        if dump is not sys.stdin:
            dump.close()

    print(json.dumps(boxes, indent=2))
def main():
    """Query the Wikipedia API for the coordinates of pages in a dump.

    Positional argument: the dump path, or '-' to read the dump from stdin.

    Options:
      -e/--existing FILE  File with one JSON object per line (as printed by
                          a previous run).  Each line is echoed to stdout
                          and its 'title' is remembered so the page is not
                          queried again.
      -w/--weatherbox     Only query pages that also contain 'weatherbox'
                          or '{{Weather'.

    For every page that contains one of the coordinate markers
    ('{{coord', 'latd', 'lat_deg', 'lat_d') the API is queried, retrying
    on HTTP errors with a doubling sleep until the wait exceeds 16 seconds.
    Each returned page entry that has 'coordinates' gets a 'length' field
    (the length of the dump page text) and is printed to stdout as JSON.

    Prints the help text and exits with status 1 when the positional
    argument is missing.
    """
    usage = """
    python scripts/query_coordinates.py dump.wiki

    Iterate over all pages with an infobox and query the Wikipedia API
    for the coordinates of this page.
    """
    num_args = 1
    parser = OptionParser(usage=usage)

    parser.add_option('-e',
                      '--existing',
                      dest='existing',
                      default=None,
                      help='Use an existing set of locations')
    parser.add_option('-w',
                      '--weatherbox',
                      dest='weatherbox',
                      default=False,
                      action='store_true',
                      help='Use only entries with an weatherbox')

    (options, args) = parser.parse_args()

    if len(args) < num_args:
        parser.print_help()
        sys.exit(1)

    existing = set()

    if options.existing is not None:
        with open(options.existing, 'r') as existing_file:
            for line in existing_file:
                print >> sys.stderr, "line:", line
                js = json.loads(line)
                # json.loads returns a dict: the title is a key, not an
                # attribute (js.title raised AttributeError).
                existing.add(js['title'])
                print(line.strip())

    read_from_stdin = args[0] == '-'
    if read_from_stdin:
        dump = sys.stdin
    else:
        dump = open(args[0], 'r')

    coordinate_markers = ('{{coord', 'latd', 'lat_deg', 'lat_d')
    counter = 0
    visited = 0

    try:
        for page in wp.wikipedia_to_single_line_pages(dump):
            title = wp.parse_title(page)
            counter += 1

            # Already emitted above from the --existing file; don't re-query.
            # NOTE(review): titles loaded via json are unicode; confirm that
            # wp.parse_title returns comparable values for non-ASCII titles.
            if title in existing:
                continue

            if not any(marker in page for marker in coordinate_markers):
                continue

            if options.weatherbox:
                if 'weatherbox' not in page and '{{Weather' not in page:
                    continue

            visited += 1

            # NOTE(review): only spaces are escaped; titles containing '&',
            # '#' or '?' will produce a malformed query string.
            url = "https://en.wikipedia.org/w/api.php?action=query&titles={}&prop=coordinates&format=json".format(
                title.replace(' ', '%20'))

            # Use a dedicated name for the HTTP response so the dump handle
            # being iterated is never shadowed.
            response = None
            wait = 1
            while response is None:
                try:
                    print >> sys.stderr, "pages:", (visited, counter), "url:", url
                    response = urllib2.urlopen(url)
                except urllib2.HTTPError:
                    print >> sys.stderr, "Waiting..."
                    time.sleep(wait)
                    wait *= 2

                    if wait > 16:
                        break

            if response is None:
                # Gave up after the back-off limit; move on to the next page.
                continue

            try:
                result = json.loads(response.read())
            finally:
                response.close()

            api_pages = result.get('query', {}).get('pages', {})
            for key in api_pages:
                entry = api_pages[key]
                if 'coordinates' in entry:
                    entry['length'] = len(page)
                    print(json.dumps(entry))
    finally:
        # Close the dump we opened ourselves; stdin is left alone.
        if not read_from_stdin:
            dump.close()
def test_parse_weather_box():
    """Smoke test: run wp.parse_weather_box over the Denver weatherbox fixture.

    The parsed result of every page is dumped to stderr as JSON; nothing is
    asserted, so the test only fails if parsing or serialisation raises.
    """
    fixture_path = 'test/data/denver_weatherbox.wiki'
    with open(fixture_path, 'r') as fixture:
        for single_page in wp.wikipedia_to_single_line_pages(fixture):
            parsed = wp.parse_weather_box(single_page)
            serialised = json.dumps(parsed)
            print >>sys.stderr, serialised
def test_astoria():
    """Smoke test: run wp.parse_weather_for_page over the Astoria fixture.

    Each page's result is written to stderr as JSON. There are no
    assertions; the test passes as long as nothing raises.
    """
    fixture_path = 'test/data/astoria.wiki'
    with open(fixture_path, 'r') as fixture:
        for single_page in wp.wikipedia_to_single_line_pages(fixture):
            parsed = wp.parse_weather_for_page(single_page)
            serialised = json.dumps(parsed)
            print >>sys.stderr, serialised
def test_latitude_longidue():
    """Smoke test: extract coordinates from every page of the Ithaca fixture.

    The two values returned by wp.parse_longitude_latitude are printed to
    stderr; nothing is asserted beyond the result unpacking into two items.
    """
    fixture_path = 'test/data/ithaca.wiki'
    with open(fixture_path, 'r') as fixture:
        for single_page in wp.wikipedia_to_single_line_pages(fixture):
            coordinates = wp.parse_longitude_latitude(single_page)
            # NOTE(review): the helper is named longitude_latitude but the
            # result is unpacked as (lat, lon) - confirm the order.
            lat, lon = coordinates

            print >>sys.stderr, "lat:", lat, "lon:", lon
def population_check(filename):
    """Report the integer 'population_total' of each page in *filename* on stderr.

    Commas and spaces are stripped from the property text before it is
    handed to wp.extract_numeric and converted with int().
    """
    with open(filename, 'r') as wiki_file:
        for page in wp.wikipedia_to_single_line_pages(wiki_file):
            cleaned = wp.find_property(page, 'population_total')
            # Remove thousands separators first, then spaces.
            for unwanted in (',', ' '):
                cleaned = cleaned.replace(unwanted, '')
            population = int(wp.extract_numeric(cleaned))
            print >>sys.stderr, "population:",  population
def main():
    """Look up page coordinates through the Wikipedia API for a dump file.

    The positional argument is the dump path ('-' means stdin).  With
    ``-e/--existing FILE`` every JSON line of FILE is echoed to stdout and
    its "title" is recorded so that page is skipped later.  With
    ``-w/--weatherbox`` only pages containing "weatherbox" or "{{Weather"
    are queried.

    A page is queried when its text contains "{{coord", "latd", "lat_deg"
    or "lat_d".  HTTP errors are retried with a doubling delay; the page is
    skipped once the delay exceeds 16 seconds.  Every API page entry that
    carries "coordinates" is printed to stdout as JSON with an added
    "length" field holding the length of the dump page text.

    Prints the help text and exits with status 1 if the positional
    argument is missing.
    """
    usage = """
    python scripts/query_coordinates.py dump.wiki

    Iterate over all pages with an infobox and query the Wikipedia API
    for the coordinates of this page.
    """
    num_args = 1
    parser = OptionParser(usage=usage)

    parser.add_option("-e", "--existing", dest="existing", default=None, help="Use an existing set of locations")
    parser.add_option(
        "-w",
        "--weatherbox",
        dest="weatherbox",
        default=False,
        action="store_true",
        help="Use only entries with an weatherbox",
    )

    (options, args) = parser.parse_args()

    if len(args) < num_args:
        parser.print_help()
        sys.exit(1)

    existing = set()

    if options.existing is not None:
        with open(options.existing, "r") as known_locations:
            for line in known_locations:
                print >> sys.stderr, "line:", line
                js = json.loads(line)
                # The decoded line is a dict, so index it; attribute access
                # (js.title) raised AttributeError.
                existing.add(js["title"])
                print(line.strip())

    dump = sys.stdin if args[0] == "-" else open(args[0], "r")

    counter = 0
    visited = 0

    try:
        for page in wp.wikipedia_to_single_line_pages(dump):
            title = wp.parse_title(page)
            counter += 1

            # Pages already present in the --existing file were echoed above.
            # NOTE(review): json titles are unicode; verify they compare equal
            # to what wp.parse_title returns for non-ASCII titles.
            if title in existing:
                continue

            has_coordinates = (
                "{{coord" in page or "latd" in page or "lat_deg" in page or "lat_d" in page
            )
            if not has_coordinates:
                continue

            if options.weatherbox and "weatherbox" not in page and "{{Weather" not in page:
                continue

            visited += 1

            # NOTE(review): only spaces are percent-encoded here; titles with
            # "&", "#" or "?" will yield a malformed query string.
            url = "https://en.wikipedia.org/w/api.php?action=query&titles={}&prop=coordinates&format=json".format(
                title.replace(" ", "%20")
            )

            # Keep the HTTP response in its own variable so the dump handle
            # is not overwritten while it is still being iterated.
            response = None
            wait = 1
            found = False

            while not found:
                try:
                    print >> sys.stderr, "pages:", (visited, counter), "url:", url
                    response = urllib2.urlopen(url)
                    found = True
                except urllib2.HTTPError:
                    print >> sys.stderr, "Waiting..."
                    time.sleep(wait)
                    wait *= 2

                    if wait > 16:
                        break

            if not found:
                continue

            try:
                text = response.read()
            finally:
                response.close()

            result = json.loads(text)

            if "query" in result and "pages" in result["query"]:
                api_pages = result["query"]["pages"]
                for key in api_pages:
                    if "coordinates" in api_pages[key]:
                        api_pages[key]["length"] = len(page)
                        print(json.dumps(api_pages[key]))
    finally:
        # Close the dump only if we opened it; leave stdin untouched.
        if dump is not sys.stdin:
            dump.close()
def test_parse_weather_box():
    """Parse the weather box of each page in the Denver fixture and log it.

    Results are JSON-encoded and written to stderr. The test makes no
    assertions and therefore only fails when an exception is raised.
    """
    with open('test/data/denver_weatherbox.wiki', 'r') as wiki_file:
        pages = wp.wikipedia_to_single_line_pages(wiki_file)
        for page_text in pages:
            print >> sys.stderr, json.dumps(wp.parse_weather_box(page_text))
def test_astoria():
    """Parse the weather data of each page in the Astoria fixture and log it.

    Results are JSON-encoded and written to stderr. No assertions are made,
    so only an exception makes the test fail.
    """
    with open('test/data/astoria.wiki', 'r') as wiki_file:
        pages = wp.wikipedia_to_single_line_pages(wiki_file)
        for page_text in pages:
            print >> sys.stderr, json.dumps(wp.parse_weather_for_page(page_text))
def test_latitude_longidue():
    """Log the coordinate pair parsed from each page of the Ithaca fixture.

    The pair returned by wp.parse_longitude_latitude is unpacked and printed
    to stderr; there are no assertions on the values themselves.
    """
    with open('test/data/ithaca.wiki', 'r') as wiki_file:
        pages = wp.wikipedia_to_single_line_pages(wiki_file)
        for page_text in pages:
            # NOTE(review): helper name says longitude first, yet the values
            # are bound as (lat, lon) - confirm which order is returned.
            lat, lon = wp.parse_longitude_latitude(page_text)

            print >> sys.stderr, "lat:", lat, "lon:", lon