def stations_parser(): #connect to database c = db.connect(database=r"./foreca.com.db") cu = c.cursor() doc = libxml2.htmlReadFile( r"./temp.html", "UTF-8", libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING) ctxt = doc.xpathNewContext() anchors = ctxt.xpathEval( "/html/body/div/div/div[4]/div/div[2]/div[@class='col3']//a/@href") anchors2 = ctxt.xpathEval( "/html/body/div/div/div[4]/div/div[2]/div[@class='col3']//a/text()") i = 0 for anchor in anchors: print anchor.content print anchors2[i] name = normalizing(re.split("/", anchor.content)[-1]) cityurl = "http://foreca.com/%s" % (anchor.content) urllib.urlretrieve(cityurl, "./station%s.html" % (name)) doc1 = libxml2.htmlReadFile( r"./station%s.html" % (name), "UTF-8", libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING) ctxt1 = doc1.xpathNewContext() anchors1 = ctxt1.xpathEval( "/html/body/div/div/div[4]/div/div[2]/div/div/div[2]/a") for anchor1 in anchors1: if anchor1.prop("href"): break if (anchor1.prop("href").find("=") == -1): continue code = re.split("=", anchor1.prop("href"))[1] print name, "-", code real_name = "" real_name = anchors2[i].content real_name = normalizing(real_name) cur = cu.execute( "select id from stations where region_id='%s' and name = '%s'" % (id_region, real_name)) station_id = None for row in cur: station_id = row[0] if (station_id == None): cur = cu.execute( 'insert into stations (name, region_id, code) values ("%s", "%s", "%s")' % (real_name, id_region, code)) code = None c.commit() i = i + 1 os.unlink("./station%s.html" % (name)) c.close() doc.freeDoc()
def main():
    # Download the configured country's foreca.com station browse page
    # and walk it letter by letter, running stations_parser() on each
    # downloaded letter page.  Globals used: country, first_letter.
    url = "http://foreca.com/%s/browse" % (country)
    print url
    urllib.urlretrieve(url, "./temp.html")
    # The default browse page doubles as the "A" letter page.
    if (first_letter == "A"):
        stations_parser()
    doc = libxml2.htmlReadFile(
        r"./temp.html", "UTF-8",
        libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR +
        libxml2.HTML_PARSE_NOWARNING)
    ctxt = doc.xpathNewContext()
    # Links of the alphabet pager; each href ends in "?bl=<letter>".
    anchors = ctxt.xpathEval(
        "/html/body/div/div/div[4]/div/div[2]/div[4]/p/a/@href")
    for anchor in anchors:
        letter = re.split("=", anchor.content)[-1]
        # Resume support: skip letters before the configured start letter
        # (plain string comparison).
        if (letter < first_letter):
            continue
        new_url = url + "?bl=%s" % (letter)
        print new_url
        # Each letter page overwrites ./temp.html, which
        # stations_parser() then reads.
        urllib.urlretrieve(new_url, "./temp.html")
        stations_parser()
    # os.unlink("./temp.html");
    doc.freeDoc()
    return 0
def stations_parser():
    # Parse ./temp.html (a downloaded foreca.com browse page for one
    # region) and insert stations missing from the local sqlite db.
    # Globals used: db, libxml2, urllib, re, os, normalizing, id_region.
    #connect to database
    c = db.connect(database=r"./foreca.com.db")
    cu = c.cursor()
    doc = libxml2.htmlReadFile(r"./temp.html", "UTF-8",
        libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR +
        libxml2.HTML_PARSE_NOWARNING)
    ctxt = doc.xpathNewContext()
    # Station link hrefs and their visible texts; aligned by index i.
    anchors = ctxt.xpathEval("/html/body/div/div/div[4]/div/div[2]/div[@class='col3']//a/@href")
    anchors2 = ctxt.xpathEval("/html/body/div/div/div[4]/div/div[2]/div[@class='col3']//a/text()")
    i = 0
    for anchor in anchors:
        print anchor.content
        print anchors2[i]
        # Last path component of the link is the station page name.
        name = normalizing(re.split("/", anchor.content)[-1])
        cityurl = "http://foreca.com/%s" %(anchor.content)
        urllib.urlretrieve (cityurl, "./station%s.html" %(name))
        doc1 = libxml2.htmlReadFile(r"./station%s.html" %(name), "UTF-8",
            libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR +
            libxml2.HTML_PARSE_NOWARNING)
        ctxt1 = doc1.xpathNewContext()
        anchors1 = ctxt1.xpathEval("/html/body/div/div/div[4]/div/div[2]/div/div/div[2]/a")
        # Find the first anchor on the station page that has an href.
        for anchor1 in anchors1:
            if anchor1.prop("href"):
                break
        # NOTE(review): if no anchor carries an href this dereferences
        # None from the last node's prop(); also this `continue` skips
        # the i increment and the unlink below, and doc1/ctxt1 leak on
        # every iteration -- confirm all of this is intended.
        if (anchor1.prop("href").find("=") == -1):
            continue
        # Station code is the value after '=' in the href query string.
        code = re.split("=", anchor1.prop("href"))[1]
        print name, "-", code;
        real_name = ""
        real_name = anchors2[i].content
        real_name = normalizing(real_name)
        # NOTE(review): SQL built by %-interpolation -- breaks on names
        # containing quotes; consider parameterized queries.
        cur = cu.execute("select id from stations where region_id='%s' and name = '%s'" %(id_region, real_name))
        station_id= None
        for row in cur:
            station_id = row[0]
        if (station_id == None):
            cur = cu.execute('insert into stations (name, region_id, code) values ("%s", "%s", "%s")' % (real_name, id_region, code))
        code = None
        c.commit()
        i = i + 1
        os.unlink("./station%s.html"%(name));
    c.close()
    doc.freeDoc()
def main():
    # Download the configured country's foreca.com browse page and walk
    # it letter by letter, running stations_parser() per letter page.
    # Globals used: country, first_letter.
    url = "http://foreca.com/%s/browse" %(country)
    print url
    urllib.urlretrieve (url, "./temp.html")
    # The default browse page doubles as the "A" letter page.
    if (first_letter == "A"):
        stations_parser()
    doc = libxml2.htmlReadFile(r"./temp.html", "UTF-8",
        libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR +
        libxml2.HTML_PARSE_NOWARNING)
    ctxt = doc.xpathNewContext()
    # Links of the alphabet pager; each href ends in "?bl=<letter>".
    anchors = ctxt.xpathEval("/html/body/div/div/div[4]/div/div[2]/div[4]/p/a/@href")
    for anchor in anchors:
        letter = re.split("=", anchor.content)[-1]
        # Resume support: skip letters before the configured start
        # letter (plain string comparison).
        if (letter < first_letter):
            continue
        new_url = url + "?bl=%s" %(letter)
        print new_url
        # Each letter page overwrites ./temp.html for stations_parser().
        urllib.urlretrieve (new_url, "./temp.html")
        stations_parser()
    # os.unlink("./temp.html");
    doc.freeDoc()
    return 0
fileToSave = mysock.read() oFile = open(r"./%s" % country_id,'wb') oFile.write(fileToSave) oFile.close #normalize file oFile = open(r"./%s" % country_id,'r') iFile = open(r"./%s.xml" % country_id,'w') iFile.write(oFile.read().replace("&", "&")) oFile.close iFile.close os.remove(r"./%s" % country_id) a = re.compile('\d\d\d+') #parse xml file doc = libxml2.htmlReadFile(r"./%s.xml" % country_id, "UTF-8", libxml2.HTML_PARSE_RECOVER) ctxt = doc.xpathNewContext() anchors = ctxt.xpathEval("//body/div/a") for anchor in anchors: href = anchor.prop("href") if href and '/gm/normal/node/prognoz_type/6/?field_wmo' in href: sql_string = 'insert into stations (region_id, russian_name, name, id_gismeteo_old, code) values ((select distinct regions.id from regions,countries where regions.country_id=countries.id and countries.id_gismeteo_old = %s), "%s", "%s", %s, %s)' % (country_id , anchor.content , anchor.content, a.search(href).group(), a.search(href).group()) print sql_string cu.execute(sql_string) c.commit() print sql_string # print href, " ", anchor.content # print a.search(href).group() doc.freeDoc()
# Fetch the gismeteo region listing (australia.html) prepared in `req`
# and walk its <li> items; the interesting region entries sit at fixed
# <li> positions that differ per continent page (see bounds below).
page = urllib2.urlopen(req)
fileToSave = page.read()
oFile = open(r"./australia.html", 'wb')
oFile.write(fileToSave)
# NOTE(review): missing () -- `oFile.close` only references the method,
# the file is never explicitly closed.
oFile.close
#connect to database
c = db.connect(database=r"./gismeteo.ru.db")
cu = c.cursor()
#Add id_gismeteo column to the regions table
#cur = cu.execute("alter table regions add gismeteo_id numeric after country_id")
#parse xml file
doc = libxml2.htmlReadFile(r"./australia.html", "UTF-8",
                           libxml2.HTML_PARSE_RECOVER)
ctxt = doc.xpathNewContext()
anchors = ctxt.xpathEval("//li")
count_li = 0
ind1 = 0
for anchor in anchors:
    if anchor.name == "li":
        count_li = count_li + 1
    # Hard-coded <li> index windows used for other continent pages:
    # if count_li>14 and count_li<60: for europe
    # if count_li>14 and count_li<28: for UIR and Russia
    # if count_li>14 and count_li<39: for Asia
    # if count_li>14 and count_li<32: for Asia(Near East)
    # if count_li>14 and count_li<71: for Africa
    # if count_li>14 and count_li<52: for North and Central America
    if count_li > 14 and count_li < 33:
        text = anchor.content
        # (chunk continues past this view in the full script)
# Fetch the gismeteo region listing (australia.html) prepared in `req`
# and walk its <li> items; region entries occupy fixed <li> positions
# that differ per continent page (see the commented bounds below).
page = urllib2.urlopen(req)
fileToSave = page.read()
oFile = open(r"./australia.html",'wb')
oFile.write(fileToSave)
# NOTE(review): missing () -- `oFile.close` only references the method,
# the file is never explicitly closed.
oFile.close
#connect to database
c = db.connect(database=r"./gismeteo.ru.db")
cu = c.cursor()
#Add id_gismeteo column to the regions table
#cur = cu.execute("alter table regions add gismeteo_id numeric after country_id")
#parse xml file
doc = libxml2.htmlReadFile(r"./australia.html" , "UTF-8",
                           libxml2.HTML_PARSE_RECOVER)
ctxt = doc.xpathNewContext()
anchors = ctxt.xpathEval("//li")
count_li = 0
ind1 = 0
for anchor in anchors:
    if anchor.name == "li":
        count_li = count_li + 1
    # Hard-coded <li> index windows used for other continent pages:
    # if count_li>14 and count_li<60: for europe
    # if count_li>14 and count_li<28: for UIR and Russia
    # if count_li>14 and count_li<39: for Asia
    # if count_li>14 and count_li<32: for Asia(Near East)
    # if count_li>14 and count_li<71: for Africa
    # if count_li>14 and count_li<52: for North and Central America
    if count_li>14 and count_li<33:
        text = anchor.content
        # (chunk continues past this view in the full script)
url = 'http://ru.wikipedia.org/w/index.php?title=%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA_%D0%B3%D0%BE%D1%80%D0%BE%D0%B4%D0%BE%D0%B2_%D0%A4%D0%B8%D0%BD%D0%BB%D1%8F%D0%BD%D0%B4%D0%B8%D0%B8&printable=yes' req = urllib2.Request(url, None, {'User-agent': 'Mozilla/5.0'}) page = urllib2.urlopen(req) fileToSave = page.read() oFile = open(r"./finland.html", 'wb') oFile.write(fileToSave) oFile.close #connect to database c = db.connect(database=r"./gismeteo.ru.db") cu = c.cursor() #parse xml file doc = libxml2.htmlReadFile(r"./finland.html", "UTF-8", libxml2.HTML_PARSE_RECOVER) ctxt = doc.xpathNewContext() anchors = ctxt.xpathEval("//table") table = False count_of_table = 0 for anchor in anchors: if anchor.name == "table": count_of_table = count_of_table + 1 if count_of_table == 3: text = anchor.content a = re.compile('\d+') count_of_row = 1 russian_name = "" for stroka in string.split(text, "\n"):
# For one country row fetched from the db (country_name, id at row[1]):
# download its yr.no place page and walk the region links on it.
# Outer-scope names used: row, country_name, baseurl, c (db connection).
id = row[1]  # NOTE(review): shadows the builtin `id`
country_name = country_name.encode('utf8')
country_name_url = baseurl + "/place/" + country_name.replace(" ", "_")
print country_name_url
req = urllib2.Request(country_name_url, None, {
    'User-agent': 'Mozilla/5.0',
    'Accept-Language': 'ru'
})
page = urllib2.urlopen(req)
fileToSave = page.read()
oFile = open(r"./%s.html" % (country_name), 'wb')
oFile.write(fileToSave)
# NOTE(review): missing () -- the file is never explicitly closed.
oFile.close
#parse xml file
doc = libxml2.htmlReadFile(r"./%s.html" % (country_name), "UTF-8",
                           libxml2.HTML_PARSE_RECOVER)
ctxt = doc.xpathNewContext()
anchors = ctxt.xpathEval(
    "//div[@class='yr-list-places clear clearfix']/dl/dd/a")
for anchor in anchors:
    cu1 = c.cursor()
    # Does a region with this name exist already (from another country)?
    cur1 = cu1.execute('select id from regions where name="%s"' %
                       (anchor.content))
    c.commit()
    row1 = ""
    for row1 in cur1:
        print row1
    if (row1 != ""):
        # Name clash: qualify the region name with its country.
        region_name = anchor.content + '/' + country_name
    else:
        region_name = anchor.content
        # (chunk truncated here in this view; see the insert in the
        # fuller copy of this fragment)
myrow += [row[0]] c.commit() # Main cicle for row in myrow: code = row print code req = urllib2.Request(url % code, None, {'User-agent': 'Mozilla/5.0'}) page = urllib2.urlopen(req) fileToSave = page.read() oFile = open(r"./temp.html",'wb') oFile.write(fileToSave) oFile.close #parse xml file doc = libxml2.htmlReadFile(r"./temp.html" , "UTF-8", libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING) ctxt = doc.xpathNewContext() anchors = ctxt.xpathEval("/html//div/h3/text()") for anchor in anchors: print "test" print anchor sql_string = 'update stations set name="%s" where id_gismeteo_new="%i"' % (anchor, code) print sql_string cu.execute(sql_string) c.commit() os.unlink("./temp.html"); c.close()
country_name = row[0] # if country_name[0] <= 'R' : # continue id = row[1] country_name = country_name.encode('utf8') country_name_url = baseurl + "/place/" + country_name.replace(" ","_") print country_name_url req = urllib2.Request(country_name_url, None, {'User-agent': 'Mozilla/5.0', 'Accept-Language':'ru'}) page = urllib2.urlopen(req) fileToSave = page.read() oFile = open(r"./%s.html"%(country_name),'wb') oFile.write(fileToSave) oFile.close #parse xml file doc = libxml2.htmlReadFile(r"./%s.html" % (country_name), "UTF-8", libxml2.HTML_PARSE_RECOVER) ctxt = doc.xpathNewContext() anchors = ctxt.xpathEval("//div[@class='yr-list-places clear clearfix']/dl/dd/a") for anchor in anchors: cu1 = c.cursor() cur1 = cu1.execute('select id from regions where name="%s"' % (anchor.content)) c.commit() row1 = "" for row1 in cur1: print row1 if (row1 != ""): region_name = anchor.content + '/' + country_name else: region_name = anchor.content cur1 = cu1.execute('insert into regions (name, country_id) values ("%s", "%s")' % (region_name, id)) c.commit()
# Fetch the gismeteo.by catalog page of US states and walk its <li>
# items; the state entries occupy a fixed <li> index window (16..84).
url_states = 'http://gismeteo.by/city/catalog/cities/?country=US&id=181'
req = urllib2.Request(url_states, None, {'User-agent': 'Mozilla/5.0'})
page = urllib2.urlopen(req)
fileToSave = page.read()
oFile = open(r"./states.html",'wb')
oFile.write(fileToSave)
# NOTE(review): missing () -- `oFile.close` only references the method,
# the file is never explicitly closed.
oFile.close
#connect to database
c = db.connect(database=r"./gismeteo.ru.db")
cu = c.cursor()
#parse xml file
doc = libxml2.htmlReadFile(r"./states.html" , "UTF-8",
                           libxml2.HTML_PARSE_RECOVER)
ctxt = doc.xpathNewContext()
anchors = ctxt.xpathEval("//li")
count_li = 0
ind1 = 0
text = ""
for anchor in anchors:
    if anchor.name == "li":
        count_li = count_li + 1
    #if count_li == 22:
    #    count_li = count_li + 1
    # Only the <li> positions holding state entries on this page.
    if count_li>15 and count_li<85:
        text = anchor.content
        if len(text) > 2:
            text_a = anchor.children
            prop = text_a.get_properties()
            # (chunk truncated here in this view)
        # (tail of the enclosing render method, whose header is outside
        # this view): render each subsection recursively.
        for s in self.subsections:
            s.render()

    def _update_index_refs(self):
        # Rewrite in-document TOC hrefs ("#<id>") to each target
        # section's final href, using the id -> section map (idmap).
        aa = self.contents.xpathEval(".//a")
        for a in aa:
            sid = a.prop("href")[1:]  # drop the leading '#'
            s = self.idmap.get(sid)
            if s:
                a.setProp("href", s.href)
            else:
                # Unresolvable anchors are logged, not fatal.
                WARN("could not resolve toc href %s" % sid)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    import sys
    # Parse the html file named on the command line and render it.
    doc = htmlReadFile(sys.argv[1], None, 0)
    INFO("read %s" % repr(doc))
    d = Document(doc)
    d.analyze()
    d.render()

# vi: ts=4 expandtab
print letter country_name_url = country_name.replace(" ", "_") print country_name req = urllib2.Request(url % (country_name_url, letter), None, { 'User-agent': 'Mozilla/5.0', 'Accept-Language': 'ru' }) page = urllib2.urlopen(req) fileToSave = page.read() oFile = open(r"./%s%s.html" % (country_name_url, letter), 'wb') oFile.write(fileToSave) oFile.close #parse xml file doc = libxml2.htmlReadFile(r"./%s%s.html" % (country_name_url, letter), "UTF-8", libxml2.HTML_PARSE_RECOVER) ctxt = doc.xpathNewContext() anchors = ctxt.xpathEval("//div/dl/dd/a") for anchor in anchors: href = anchor.prop("href") name_href = href.split('/') name = name_href[2].replace("'", "") russian_name = anchor.content.replace("'", "") print name, "-", russian_name cur = cu.execute( 'update stations set name="%s" where russian_name="%s" and name = "%s" and region_id = (select id from regions where name= "%s")' % (name, russian_name, russian_name, country_name)) c.commit() doc.freeDoc() os.remove(r"./%s%s.html" % (country_name_url, letter))
req = urllib2.Request(url, None, {'User-agent': 'Mozilla/5.0'}) page = urllib2.urlopen(req) fileToSave = page.read() oFile = open(r"./finland.html",'wb') oFile.write(fileToSave) oFile.close #connect to database c = db.connect(database=r"./gismeteo.ru.db") cu = c.cursor() #parse xml file doc = libxml2.htmlReadFile(r"./finland.html" , "UTF-8", libxml2.HTML_PARSE_RECOVER) ctxt = doc.xpathNewContext() anchors = ctxt.xpathEval("//table") table = False count_of_table = 0 for anchor in anchors: if anchor.name == "table": count_of_table = count_of_table +1 if count_of_table == 3: text = anchor.content a = re.compile('\d+') count_of_row = 1 russian_name = "" for stroka in string.split(text, "\n"):