Example #1
def stations_parser():

    #connect to database
    c = db.connect(database=r"./foreca.com.db")
    cu = c.cursor()

    doc = libxml2.htmlReadFile(
        r"./temp.html", "UTF-8", libxml2.HTML_PARSE_RECOVER +
        libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING)
    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval(
        "/html/body/div/div/div[4]/div/div[2]/div[@class='col3']//a/@href")
    anchors2 = ctxt.xpathEval(
        "/html/body/div/div/div[4]/div/div[2]/div[@class='col3']//a/text()")
    i = 0
    for anchor in anchors:
        print anchor.content
        print anchors2[i]
        name = normalizing(re.split("/", anchor.content)[-1])
        cityurl = "http://foreca.com/%s" % (anchor.content)
        urllib.urlretrieve(cityurl, "./station%s.html" % (name))
        doc1 = libxml2.htmlReadFile(
            r"./station%s.html" % (name), "UTF-8", libxml2.HTML_PARSE_RECOVER +
            libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING)
        ctxt1 = doc1.xpathNewContext()
        anchors1 = ctxt1.xpathEval(
            "/html/body/div/div/div[4]/div/div[2]/div/div/div[2]/a")
        # find the first link that actually carries an href
        href = None
        for anchor1 in anchors1:
            href = anchor1.prop("href")
            if href:
                break
        ctxt1.xpathFreeContext()
        doc1.freeDoc()
        if not href or href.find("=") == -1:
            # no "...=code" parameter: clean up and keep the counter in step
            os.unlink("./station%s.html" % (name))
            i = i + 1
            continue
        code = re.split("=", href)[1]
        print name, "-", code
        real_name = normalizing(anchors2[i].content)
        cur = cu.execute(
            "select id from stations where region_id='%s' and name = '%s'" %
            (id_region, real_name))
        station_id = None
        for row in cur:
            station_id = row[0]
        if station_id is None:
            cur = cu.execute(
                'insert into stations (name, region_id, code) values  ("%s", "%s", "%s")'
                % (real_name, id_region, code))
        code = None
        c.commit()
        i = i + 1
        os.unlink("./station%s.html" % (name))

    c.close()
    ctxt.xpathFreeContext()
    doc.freeDoc()
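These snippets come from a larger script and rely on module-level names (db, normalizing, country, first_letter, id_region) that the excerpt never shows. A minimal sketch of the setup they appear to assume; every name and value below is a guess, not the original code:

import os
import re
import urllib
import libxml2
import sqlite3 as db      # assumption: "db" is sqlite3, given the .db file paths

country = "finland"       # assumption: set elsewhere in the script
first_letter = "A"        # assumption: the letter to resume crawling from
id_region = 1             # assumption: id of the region being populated

def normalizing(name):
    # assumption: reduces a raw station name to a file-name-safe form
    return re.sub(r"[^\w-]", "_", name.strip())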
Example #2
def main():
    # build the per-country browse URL
    url = "http://foreca.com/%s/browse" % (country)
    print url

    urllib.urlretrieve(url, "./temp.html")
    # the bare browse page lists the "A" stations
    if first_letter == "A":
        stations_parser()
    doc = libxml2.htmlReadFile(
        r"./temp.html", "UTF-8", libxml2.HTML_PARSE_RECOVER +
        libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING)
    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval(
        "/html/body/div/div/div[4]/div/div[2]/div[4]/p/a/@href")
    for anchor in anchors:
        letter = re.split("=", anchor.content)[-1]
        if letter < first_letter:
            # resume from the requested starting letter
            continue
        new_url = url + "?bl=%s" % (letter)
        print new_url
        urllib.urlretrieve(new_url, "./temp.html")
        stations_parser()


    # os.unlink("./temp.html")
    ctxt.xpathFreeContext()
    doc.freeDoc()
    return 0
Example #3
    fileToSave = mysock.read()
    oFile = open(r"./%s" % country_id, 'wb')
    oFile.write(fileToSave)
    oFile.close()

    # normalize the file: escape bare ampersands so the parser accepts it
    oFile = open(r"./%s" % country_id, 'r')
    iFile = open(r"./%s.xml" % country_id, 'w')
    iFile.write(oFile.read().replace("&", "&amp;"))
    oFile.close()
    iFile.close()
    os.remove(r"./%s" % country_id)

    a = re.compile(r'\d\d\d+')
    # parse the normalized XML file
    doc = libxml2.htmlReadFile(r"./%s.xml" % country_id, "UTF-8", libxml2.HTML_PARSE_RECOVER)
    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval("//body/div/a")
    for anchor in anchors:
        href = anchor.prop("href")
        if href and '/gm/normal/node/prognoz_type/6/?field_wmo' in href:
            wmo = a.search(href).group()
            sql_string = (
                'insert into stations (region_id, russian_name, name, '
                'id_gismeteo_old, code) values ((select distinct regions.id '
                'from regions, countries where regions.country_id = '
                'countries.id and countries.id_gismeteo_old = %s), "%s", '
                '"%s", %s, %s)' %
                (country_id, anchor.content, anchor.content, wmo, wmo))
            print sql_string
            cu.execute(sql_string)
            c.commit()
#            print href, " ", anchor.content
#            print a.search(href).group()


    ctxt.xpathFreeContext()
    doc.freeDoc()
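The insert above splices page content straight into the SQL text, so a name containing a double quote breaks the statement. Assuming db is the sqlite3 module (the ?-style parameter markers below rest on that assumption), the same statement can be issued with parameter binding; the other execute calls in these examples would benefit from the same treatment:

wmo = a.search(href).group()
sql = ('insert into stations (region_id, russian_name, name, '
       'id_gismeteo_old, code) values ((select distinct regions.id '
       'from regions, countries where regions.country_id = countries.id '
       'and countries.id_gismeteo_old = ?), ?, ?, ?, ?)')
# values travel separately from the SQL, so quotes in anchor.content are harmless
cu.execute(sql, (country_id, anchor.content, anchor.content, wmo, wmo))
c.commit()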
Example #4
page = urllib2.urlopen(req)

fileToSave = page.read()
oFile = open(r"./australia.html", 'wb')
oFile.write(fileToSave)
oFile.close()

#connect to database
c = db.connect(database=r"./gismeteo.ru.db")
cu = c.cursor()

#Add id_gismeteo column to the regions table
#cur = cu.execute("alter table regions add gismeteo_id numeric after country_id")

# parse the saved HTML page
doc = libxml2.htmlReadFile(r"./australia.html", "UTF-8",
                           libxml2.HTML_PARSE_RECOVER)
ctxt = doc.xpathNewContext()
anchors = ctxt.xpathEval("//li")
count_li = 0
ind1 = 0
for anchor in anchors:
    if anchor.name == "li":
        count_li = count_li + 1
# if count_li>14 and count_li<60: for europe
# if count_li>14 and count_li<28: for UIR and Russia
# if count_li>14 and count_li<39: for Asia
# if count_li>14 and count_li<32: for Asia(Near East)
# if count_li>14 and count_li<71: for Africa
# if count_li>14 and count_li<52: for North and Central America
    if count_li > 14 and count_li < 33:
        text = anchor.content
Example #5
url = 'http://ru.wikipedia.org/w/index.php?title=%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D0%B3%D0%BE%D1%80%D0%BE%D0%B4%D0%BE%D0%B2_%D0%A4%D0%B8%D0%BD%D0%BB%D1%8F%D0%BD%D0%B4%D0%B8%D0%B8&printable=yes'

req = urllib2.Request(url, None, {'User-agent': 'Mozilla/5.0'})
page = urllib2.urlopen(req)

fileToSave = page.read()
oFile = open(r"./finland.html", 'wb')
oFile.write(fileToSave)
oFile.close()

#connect to database
c = db.connect(database=r"./gismeteo.ru.db")
cu = c.cursor()

# parse the saved HTML page
doc = libxml2.htmlReadFile(r"./finland.html", "UTF-8",
                           libxml2.HTML_PARSE_RECOVER)
ctxt = doc.xpathNewContext()
anchors = ctxt.xpathEval("//table")
table = False
count_of_table = 0
for anchor in anchors:
    if anchor.name == "table":
        count_of_table = count_of_table + 1
    if count_of_table == 3:
        text = anchor.content

a = re.compile(r'\d+')
count_of_row = 1
russian_name = ""
for stroka in text.split("\n"):
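The snippet breaks off while splitting the third table's text into lines. The rows can also be taken from the parsed tree directly, which preserves the cell boundaries; a sketch reusing the context built above (which cell holds which field is an assumption about the Wikipedia table, not something the excerpt shows):

rows = ctxt.xpathEval("(//table)[3]//tr")
for row in rows:
    # copy out the text of each cell in this row
    cells = [td.content.strip() for td in row.xpathEval(".//td")]
    if cells:
        print cells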
Example #6
    myrow += [row[0]]
    c.commit()
# main loop over the collected station codes
for row in myrow:
    code = row
    print code 

    req = urllib2.Request(url % code, None, {'User-agent': 'Mozilla/5.0'})
    page = urllib2.urlopen(req)

    fileToSave = page.read()
    oFile = open(r"./temp.html", 'wb')
    oFile.write(fileToSave)
    oFile.close()
    # parse the saved HTML page
    doc = libxml2.htmlReadFile(
        r"./temp.html", "UTF-8", libxml2.HTML_PARSE_RECOVER +
        libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING)
    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval("/html//div/h3/text()")
    for anchor in anchors:
        print "test"
        print anchor
        sql_string = ('update stations set name="%s" where id_gismeteo_new="%i"'
                      % (anchor.content, code))
        print sql_string
        cu.execute(sql_string)
        c.commit()

    ctxt.xpathFreeContext()
    doc.freeDoc()
    os.unlink("./temp.html")



c.close()
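Each example repeats the same read-parse-evaluate-free sequence, and several free the document but never the XPath context, which leaks memory inside a loop. A small helper, sketched here rather than taken from the original, that copies the node text out before releasing both:

def xpath_contents(path, expr):
    # parse quietly, evaluate the expression, then free context and document
    flags = (libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR +
             libxml2.HTML_PARSE_NOWARNING)
    doc = libxml2.htmlReadFile(path, "UTF-8", flags)
    ctxt = doc.xpathNewContext()
    try:
        # node.content is a plain string, safe to keep after freeing the doc
        return [node.content for node in ctxt.xpathEval(expr)]
    finally:
        ctxt.xpathFreeContext()
        doc.freeDoc()

With it, the loop above reduces to: for name in xpath_contents("./temp.html", "/html//div/h3/text()"): ...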
Example #7
    country_name = row[0]
#    if country_name[0] <= 'R' :
#        continue
    id = row[1]
    country_name = country_name.encode('utf8')
    country_name_url = baseurl + "/place/" + country_name.replace(" ", "_")
    print country_name_url
    req = urllib2.Request(country_name_url, None, {
        'User-agent': 'Mozilla/5.0',
        'Accept-Language': 'ru'
    })
    page = urllib2.urlopen(req)

    fileToSave = page.read()
    oFile = open(r"./%s.html" % (country_name), 'wb')
    oFile.write(fileToSave)
    oFile.close()
    # parse the saved HTML page
    doc = libxml2.htmlReadFile(r"./%s.html" % (country_name), "UTF-8",
                               libxml2.HTML_PARSE_RECOVER)
    ctxt = doc.xpathNewContext()
    anchors = ctxt.xpathEval(
        "//div[@class='yr-list-places clear clearfix']/dl/dd/a")
    for anchor in anchors:
        cu1 = c.cursor()
        cur1 = cu1.execute('select id from regions where name="%s"' %
                           (anchor.content))
        c.commit()
        row1 = ""
        for row1 in cur1:
            print row1
        # a hit means the region name already exists, so qualify it with
        # the country name to keep names unique
        if row1 != "":
            region_name = anchor.content + '/' + country_name
        else:
            region_name = anchor.content
        cur1 = cu1.execute(
            'insert into regions (name, country_id) values ("%s", "%s")' %
            (region_name, id))
        c.commit()
Example #8
url_states = 'http://gismeteo.by/city/catalog/cities/?country=US&id=181'
req = urllib2.Request(url_states, None, {'User-agent': 'Mozilla/5.0'})
page = urllib2.urlopen(req)

fileToSave = page.read()
oFile = open(r"./states.html", 'wb')
oFile.write(fileToSave)
oFile.close()

#connect to database
c = db.connect(database=r"./gismeteo.ru.db")
cu = c.cursor()

# parse the saved HTML page
doc = libxml2.htmlReadFile(r"./states.html", "UTF-8",
                           libxml2.HTML_PARSE_RECOVER)
ctxt = doc.xpathNewContext()
anchors = ctxt.xpathEval("//li")
count_li = 0
ind1 = 0
text = ""
for anchor in anchors:
    if anchor.name == "li":
        count_li = count_li + 1
    #if count_li == 22:
    #  count_li = count_li + 1
    if count_li > 15 and count_li < 85:
        text = anchor.content
        if len(text) > 2:
            text_a = anchor.children
            prop = text_a.get_properties()
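The snippet ends right after fetching the first child's attribute list. In the libxml2 bindings that list is a linked chain of attribute nodes walked through .next; a sketch of the usual traversal (checking for an href is an assumption about what the original loop did next):

prop = text_a.get_properties()      # first attribute node, or None
while prop is not None:
    if prop.name == "href":
        print prop.content          # the attribute's value
    prop = prop.next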
Example #9
            for s in self.subsections:
                s.render()

    def _update_index_refs(self):

        aa = self.contents.xpathEval(".//a")
        for a in aa:
            sid = a.prop("href")[1:]
            s = self.idmap.get(sid)
            if s:
                a.setProp("href", s.href)
            else:
                WARN("could not resolve toc href %s" % sid)

if __name__ == '__main__':

    logging.basicConfig(level=logging.INFO)
    import sys

    doc = htmlReadFile(sys.argv[1], None, 0)
    INFO("read %s" % repr(doc))

    d = Document(doc)
    d.analyze()
    d.render()

# vi: ts=4 expandtab


Example #10
        print letter
        country_name_url = country_name.replace(" ", "_")
        print country_name
        req = urllib2.Request(url % (country_name_url, letter), None, {
            'User-agent': 'Mozilla/5.0',
            'Accept-Language': 'ru'
        })
        page = urllib2.urlopen(req)

        fileToSave = page.read()
        oFile = open(r"./%s%s.html" % (country_name_url, letter), 'wb')
        oFile.write(fileToSave)
        oFile.close()

        # parse the saved HTML page
        doc = libxml2.htmlReadFile(r"./%s%s.html" % (country_name_url, letter),
                                   "UTF-8", libxml2.HTML_PARSE_RECOVER)
        ctxt = doc.xpathNewContext()
        anchors = ctxt.xpathEval("//div/dl/dd/a")
        for anchor in anchors:
            href = anchor.prop("href")
            name_href = href.split('/')
            name = name_href[2].replace("'", "")
            russian_name = anchor.content.replace("'", "")
            print name, "-", russian_name
            cur = cu.execute(
                'update stations set name="%s" where russian_name="%s" and '
                'name = "%s" and region_id = '
                '(select id from regions where name = "%s")' %
                (name, russian_name, russian_name, country_name))
            c.commit()

        ctxt.xpathFreeContext()
        doc.freeDoc()
        os.remove(r"./%s%s.html" % (country_name_url, letter))