#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'lish'

import imdb
import sys

# Python 2 idiom: reload sys to get setdefaultencoding back, then make
# utf-8 the default codec for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding('utf-8')

# Alternative kept from the original: derive the path from the script location.
# base_path=os.path.split( os.path.realpath( sys.argv[0] ) )[0]
base_path = '/home/lish/imread/chapcontent/'

# Shared database handle used by ClassifiedTags().
IMOfficialDB = imdb.IMReadDB("192.168.0.34", 3306, "ebook", "ebook%$amRead")


def ClassifiedTags():
    """Fill public_db.tmp_con_tag with the most frequent tags of each class.

    Class ids are read from ebook_con.con_class (ids below 60, excluding 0
    and 11).  For every class id one INSERT ... SELECT is issued that copies
    at most 200 tags into public_db.tmp_con_tag, ordered by the number of
    tag/content rows that point at a book of that class (descending).
    """
    class_query = ('SELECT class_id FROM ebook_con.con_class '
                   'WHERE class_id<60 and class_id<>0 and class_id<>11')

    # Adjacent literals are concatenated at compile time; the resulting
    # statement text is a single line with one %s slot for the class id.
    insert_template = (
        "INSERT INTO public_db.tmp_con_tag (tag_id,tag_name,tag_frequency,class_id) "
        "SELECT aa.tag_id,aa.tag_name,bb.num,bb.class_id "
        "FROM ebook_con.con_tag aa, "
        "(SELECT a.tag_id,COUNT(*)num,b.class_id "
        "FROM ebook_con.con_tag_content a, "
        "(SELECT book_id,book_name,book_tag,class_id FROM ebook_con.con_book WHERE class_id=%s)b "
        "WHERE a.content_id=b.book_id GROUP BY a.tag_id)bb "
        "WHERE aa.tag_id=bb.tag_id ORDER BY bb.num desc limit 200"
    )

    for row in IMOfficialDB.selectdb(class_query):
        # int() keeps the value substituted into the SQL strictly numeric.
        IMOfficialDB.insertdb(insert_template % int(row[0]))


if __name__ == '__main__':
    ClassifiedTags()
# -*- coding: utf-8 -*- __author__ = 'lish' import imdb import os, sys base_path = '/opt/www/api/attachment/imread/chapcontent/' imreaddb = imdb.IMReadDB("100.98.73.21", 3306, "ebook", "4titbrVcvnP6LSFA") selectsql = 'SELECT book_id from ebook_con.con_book where mcp_id is null and source_id=2 and word_count is null' resultebids = imreaddb.selectdb(selectsql) for resultebid in resultebids: bid = int(resultebid[0]) print '正在更新图书bid:%s word_count字段内容!' % bid wordcount = 0 selectsql = 'SELECT chapter_id from ebook_con.con_chapter where book_id=%s' % bid resultecids = imreaddb.selectdb(selectsql) for resultecid in resultecids: cid = int(resultecid[0]) try: chapterpath = base_path + '%s/charpters/%s.txt' % (bid, cid) fr = open(chapterpath, 'r') conts = fr.readlines() for cont in conts: wordcount += len(cont.replace('\n', '').replace(' ', '')) / 3 except: continue wordcount = str(float(wordcount) / 10000) + '万' if wordcount != '0.0万': updatesql = "update ebook_con.con_book set word_count='%s' where book_id=%s" % (
import imdb, imcrawl, os, sys
import ConfigParser
# NOTE(review): a `global` statement at module scope has no effect; the names
# below are module-level globals either way.
global base_url, base_path, imreaddb
# Directory containing the running script, with a trailing slash.
conf_path = os.path.split(os.path.realpath(sys.argv[0]))[0] + '/'
# Settings are read from imopenapi.conf located next to the script.
cf = ConfigParser.ConfigParser()
cf.read(conf_path + "imopenapi.conf")
base_url = cf.get("prefixurl", "base_url")
base_path = cf.get("prefixpath", "base_path")
db_port = cf.getint("db", "db_port")
db_user = cf.get("db", "db_user")
db_host = cf.get("db", "db_host")
db_pass = cf.get("db", "db_pass")
# Shared database handle built from the [db] section of the config file.
imreaddb = imdb.IMReadDB(db_host, db_port, db_user, db_pass)
# Crawl the APIs registered for one MCP (content provider id `mcpid`).
# NOTE(review): the function continues past the end of this view; only its
# first statements are documented here.
def crawlAPI(mcpid):
    # Look up the API endpoints configured for this mcp_id.
    selectsql = 'SELECT api_type,api_url from ebook_con.con_mcp_api where mcp_id=%s' % mcpid
    results = imreaddb.selectdb(selectsql)
    # Build a dict of first column -> str(second column); assuming rows come
    # back in SELECT column order, that is api_type -> api_url.
    apiurls = {}
    for result in results:
        apiurls = dict(apiurls, **{result[0]: str(result[1])})
    # NOTE(review): `imxml` is not among the imports visible above
    # (imdb, imcrawl, os, sys, ConfigParser) -- confirm it is imported.
    yuemingapp = imxml.IMxmlAPI(apiurls)
    bids = yuemingapp.BookIds()
    # Fetch the source_bid values already stored for this mcp_id.
    selectsql = 'select source_bid from ebook_con.con_book where mcp_id=%s' % mcpid
    isOldSids = []
    results = imreaddb.selectdb(selectsql)
#!/usr/bin/env python # -*- coding:utf-8 -*- __author__ = 'lish' import os import imdb import sys reload(sys) sys.setdefaultencoding('utf-8') base_path = os.path.split(os.path.realpath(sys.argv[0]))[0] formaldb = imdb.IMReadDB("123.56.138.94", 3307, "ebook", "4titbrVcvnP6LSFA", "ebook_con") #图书对应的标签列表[tag1,tag2,..] def BooksToTag(tagid): # selectsql = 'SELECT content_id,tag_id FROM ebook_con.con_tag_content where tag_id = %d'%tagid selectsql = 'SELECT book_id,tag_id FROM public_db.tmp_con_tag_content where tag_id = %d' % tagid results = formaldb.selectdb(selectsql) BooksToTag = [] for result in results: bid = int(result[0]) # tagid = int(result[1]) BooksToTag.append(bid) return BooksToTag #相关度公式 def RelatedFormula(tag1, tag2):
records[(GNewid, Gid)] = Gvalue Gids.append(GNewid) # print Gids ##剔除Gconts中旧分类及分类规则键值对 for key, value in Gconts.items(): if isinstance(key, str): for oldG in value: del Gconts[oldG] return Gconts if __name__ == '__main__': imformaldb = imdb.IMReadDB("123.56.138.94", 3307, "ebook", "4titbrVcvnP6LSFA", "ebook_con") imstatisticaldb = imdb.IMReadDB("182.92.184.14", 3306, "cx_fujun", "fjfjie%mysql3", "ds_read") selectsql = 'SELECT book_id,SUM(read_uv) FROM ds_read.prd_bid_d WHERE stat_day>20161219 GROUP BY book_id' resluts = imstatisticaldb.selectdb(selectsql) resluts = [(int(reslut[0]), int(reslut[1])) for reslut in resluts] tagsdict = {} for result in resluts: selectsql = 'SELECT tag_id FROM ebook_con.con_tag_content WHERE content_id=%d' % result[ 0] tagids = imformaldb.selectdb(selectsql) for tagid in tagids: tagdictkey = int(tagid[0]) if tagdictkey not in tagsdict.keys(): tagsdict[tagdictkey] = int(result[1])
#!/usr/bin/env python # -*- coding:utf-8 -*- __author__ = 'lish' import urllib2 import jieba, json import imdb from collections import Counter import sys sys.path.append("../") sys.setdefaultencoding('utf-8') IMOfficialDB = imdb.IMReadDB("123.56.138.94", 3307, "ebook", "4titbrVcvnP6LSFA") ##获取图书用于分词的内容 def segerateCont(bookid): introduceapi = 'http://readapi.imread.com/api/v1/book/introduce?bid=%s&spm=1.120.0.1&scm=1.320644' % bookid introducecont = urllib2.urlopen(introduceapi).read() introducecont = json.loads(introducecont) alltext = introducecont['book_brief'] + introducecont['tag'] chapterlistapi = 'http://readapi.imread.com/api/v1/book/chapterlist?bid=%s&page=1&page_size=200000&order_type=asc&vt=9' % bookid chapterlistcont = urllib2.urlopen(chapterlistapi).read() chapterlistjsoncont = json.loads(chapterlistcont) for para in chapterlistjsoncont['chapterList']: if int(para['feeType']) == 0: chapterid = para['cid'] # print chapterid chaptercontapi = 'http://readapi.imread.com/api/v2/chapter/2/%s/%s/index?auto_pay=0&cm=M3540030' % ( bookid, chapterid) # print chaptercontapi chaptercontcont = urllib2.urlopen(chaptercontapi).read()