def insert(mddid, name, box_dict):
    """Insert one row into the ``mfwmdd`` table through ``dbconn.insert``.

    Args:
        mddid: value stored in the ``mddid`` column.
        name: value stored in the ``name`` column.
        box_dict: mapping indexed with every column name listed in
            ``columns`` below; each looked-up value is stored in the column
            of the same name.  A missing key fails on the lookup, before
            anything is inserted.
    """
    # Columns copied verbatim from box_dict, in the order they are looked up.
    columns = (
        "post_main",
        "box_box_photo",
        "box_master",
        "box_box_map",
        "box_baike",
        "box_box_discuss",
        "box_box_plan",
        "box_other_city",
        "box_box_book",
        "box_box_tips",
        "poi_nav",
        "box_postBox",
        "city_hd",
        "box_mb10",
        "box_mod_rec",
        "box_hotBox2",
        "box_pathBox2",
        "box_pathBox1",
        "box_hotBox1",
        "info_cate",
        "box_other_mdd",
    )
    row = {}
    for column in columns:
        row[column] = box_dict[column]
    dbconn.insert("mfwmdd", mddid=mddid, name=name, **row)
def fileToNickname():
    """Store a nickname for every user that has a first feed page on disk.

    Every sub-directory of ``tempDir`` is scanned for files whose name
    contains ``<userid>_<page>.html``.  For each file whose page number is 1,
    the ``mfwuser`` table is queried for that user id; when the query returns
    no rows, the nickname is obtained from ``getNickName(userid)`` and a new
    ``mfwuser`` row is inserted.  Entries of ``tempDir`` that are not
    directories and file names that do not match the pattern are skipped.
    """
    # Raw string with an escaped dot: the bare "." used previously matched any
    # character, so names such as "12_1xhtml" were accepted as feed pages.
    comp = re.compile(r"(\d+)_(\d+)\.html")
    for feedDir in os.listdir(tempDir):
        feedPath = tempDir + "/" + feedDir
        if not os.path.isdir(feedPath):
            continue
        for filename in os.listdir(feedPath):
            m = comp.search(filename)
            if not m:
                continue
            # Only the first page of a user's feed triggers a lookup, so each
            # user is handled once however many pages were saved.
            if int(m.group(2)) != 1:
                continue
            userid = int(m.group(1))
            known = dbconn.query(
                "select * from mfwuser where userid = $userid",
                vars=dict(userid=userid),
            )
            if len(known) == 0:
                nickname = getNickName(userid)
                dbconn.insert("mfwuser", userid=userid, nickname=nickname)
def getPagesAndCal(userid, lastpage):
    """Summarise a user's saved feed pages into one ``mfwuserfeed`` row.

    Pages ``1..lastpage`` are read from
    ``tempDir/<first two characters of userid>/<userid>_<page>.html`` and
    passed to ``getFeed``; the returned entries are accumulated in page
    order.  Exactly one row is inserted per call:

    * no entries: only ``userid``, ``pageCount`` and ``sumCount=0``;
    * one entry: that entry supplies ``firstAct``, ``firstActTime`` and
      ``mostAct``, and both density columns are set to 1;
    * otherwise: the columns are computed with ``getActDateSummary``,
      ``summaryToString``, ``calDense`` and ``calRate``, printed one per
      line as ``<column>:<value>``, and then inserted.

    The last accumulated entry is the one recorded as the first activity;
    its index 0 is stored as ``firstAct`` and index 1 as ``firstActTime``.

    Args:
        userid: user identifier; its string form builds the file paths.
        lastpage: number of the last saved page (pages start at 1).

    Returns:
        None.
    """
    pagePrefix = tempDir + "/" + str(userid)[0:2] + "/" + str(userid) + "_"
    actDictList = []
    for pagenumber in range(1, lastpage + 1):
        # Close each page right after reading it instead of leaking one file
        # handle per page.
        with open(pagePrefix + str(pagenumber) + ".html") as pageFile:
            html = pageFile.read()
        actDictList.extend(getFeed(html))

    if len(actDictList) == 0:
        dbconn.insert(
            "mfwuserfeed",
            userid=userid,
            pageCount=lastpage,
            sumCount=0,
        )
        return

    firstAct = actDictList[-1][0]
    firstActTime = actDictList[-1][1]

    if len(actDictList) == 1:
        dbconn.insert(
            "mfwuserfeed",
            userid=userid,
            pageCount=lastpage,
            sumCount=1,
            firstAct=firstAct,
            firstActTime=firstActTime,
            mostAct=firstAct,
            actSummaryString="$" + str(firstAct) + "|1",
            actDense=1,
            dateDense=1,
        )
        return

    ##registryTime = getRegistryTime()
    actSummary, dateSummary = getActDateSummary(actDictList)
    mostAct = actSummary[0][0]
    actSummaryString = summaryToString(actSummary)
    actDense = calDense(actSummary)
    dateDense = calDense(dateSummary)
    longestPeriod, mostPeriod, deviation, avgPerd, middlePerd = calRate(dateSummary)

    # One ordered list drives both the debug output and the inserted columns,
    # so the two can no longer drift apart.
    stats = [
        ("sumCount", len(actDictList)),
        ("firstAct", firstAct),
        ("firstActTime", firstActTime),
        ("mostAct", mostAct),
        ("actSummaryString", actSummaryString),
        ("actDense", actDense),
        ("dateDense", dateDense),
        ("longestPeriod", longestPeriod),
        ("mostPeriod", mostPeriod),
        ("deviation", deviation),
        ("avgPerd", avgPerd),
        ("middlePerd", middlePerd),
    ]
    for label, value in stats:
        # A single parenthesised argument prints the same text whether print
        # is a statement or a function.
        print(label + ":" + str(value))

    dbconn.insert(
        "mfwuserfeed",
        userid=userid,
        pageCount=lastpage,
        **dict(stats)
    )
def cal(className):
    """Count one occurrence of ``className`` in the module-level ``class_dict``.

    Args:
        className: either a key usable directly in ``class_dict`` or a list
            of strings.  A list is flattened into a single string in which
            every item is preceded by one space (``["a", "b"]`` becomes
            ``" a b"``; an empty list becomes ``""``).

    Returns:
        None; ``class_dict`` is updated in place.
    """
    if isinstance(className, list):
        className = "".join(" " + c for c in className)
    # dict.get replaces the has_key/else pair and avoids dict.has_key, which
    # no longer exists on Python 3.
    class_dict[className] = class_dict.get(className, 0) + 1


##for mddDir in os.listdir(tempMddDir):
##    if not os.path.isdir(tempMddDir +"/"+ mddDir):
##        continue

# Feed every file of the "1" sub-directory to ana(), closing each file as soon
# as it has been read.
for filename in os.listdir(tempMddDir + "/1"):
    with open(tempMddDir + "/1/" + filename, "r") as pageFile:
        ana(pageFile.read())

# Dump the counters as "<name>,<count>" lines; the with-block guarantees the
# log is flushed and closed even if a write fails.
with open("d:/log/mmd.log", "wb") as f:
    for d in dictToOrderList(class_dict):
        f.write(d[0] + "," + str(d[1]) + "\r\n")

# NOTE(review): the closing delimiter of the disabled snippet below is not
# visible in this chunk; it is closed here so the block parses on its own.
# Confirm against the full file.
"""
res = dbconn.query("select distinct pid from mfwmdd where pid is not null")
comp = re.compile(u"<title>(.+)地区旅游地图")
for r in res:
    pid = r.pid
    html = open(tempMddDir+"/"+str(pid)[0]+"/"+str(pid)+".html","r").read()
    soup = BeautifulSoup(html,from_encoding="utf8")
    name = comp.search(unicode(soup.title)).group(1)
    dbconn.insert("mfwpid",pid=pid,name=name)
"""
# coding:utf-8
# Tally the text of every "news_list" entry found in the HTML files directly
# under tempDir and store one "mfwaction" row per distinct text.
from bs4 import BeautifulSoup
import bs4
import os
from publicsettings import tempDir, dbconn

# Maps the "$"-joined text of an entry to the number of times it was seen.
moveDict = {}
htmlfiles = os.listdir(tempDir)
for htmlfile in htmlfiles:
    # The parser consumes the file inside the constructor, so the handle can
    # be closed immediately instead of being leaked for every page.
    with open(tempDir + "/" + htmlfile) as html:
        soup = BeautifulSoup(html, from_encoding="utf8")
    news_lists = soup.find_all("div", "news_list")
    for news_list in news_lists:
        con = news_list.find("div", "con")
        if con is None:
            # find() returns None when the entry has no "con" div; skip the
            # entry rather than crash on con.children.
            continue
        ##print con.string
        pieces = []
        for child in con.children:
            # Only direct text nodes contribute; nested tags are ignored.
            if isinstance(child, bs4.element.NavigableString):
                piece = child.strip().encode("utf8")
                if piece != "":
                    pieces.append(piece)
        # Every non-empty piece is prefixed with "$", e.g. "$first$second".
        text = "".join("$" + piece for piece in pieces)
        if text != "":
            moveDict[text] = moveDict.get(text, 0) + 1

for name, count in moveDict.items():
    dbconn.insert("mfwaction", name=name, count=count)