# NOTE(review): orphaned fragment — this is a byte-for-byte duplicate of
# Manager.__init__ defined later in this file, but here it sits at module
# level with no enclosing class. Presumably a stray paste; confirm and delete.
def __init__(self):
    """Connect to the database and, on success, set up the two scraper
    sessions (rutracker + kinopoisk) and start crawling.

    Reads configuration from self.data (postgres DSN, poster URL base,
    static-files path, rutracker cookies). Does nothing if the DB
    connection fails.
    """
    self.db = dbconnecter()
    # NOTE(review): statement grouping reconstructed from mangled source —
    # everything below appears to run only when the DB connect succeeds.
    if self.db.connect(self.data['postgres']):
        # Jumper looks like a thin HTTP-session wrapper (see spider module);
        # proxies=None means direct connection.
        self.rutracker = Jumper(url='http://rutracker.org/forum/viewforum.php', proxies=None)
        self.kinopoisk = Jumper('http://www.kinopoisk.ru/index.php', None)
        # Pre-compile all regexes used by the parsing methods.
        self.reCompile()
        self.posterurl = self.data['poster']
        self.staticpath = self.data["static"]
        # Old login-based auth, kept for reference:
        # self.rutracker.cookies = self.rutracker.post(url='http://login.rutracker.org/forum/login.php',logindata=self.data['login'])
        # Authenticate with pre-captured cookies instead of logging in.
        self.rutracker.setcookiejar_from_dict(self.data['cookies'])
        self.start()
# Standalone script: downloads a statistics table from gks.ru (Rosstat)
# and parses it with scrapy Selectors. Formatting reconstructed from
# mangled source; the trailing for-loop is truncated in this copy.
import datetime
from scrapy.selector import Selector
import matplotlib.pyplot as plt
import aprox
from spider import Jumper

# HTTP session for the Rosstat CBSD query endpoint (no proxy).
gks_ru = Jumper(url='http://www.gks.ru/dbscripts/cbsd/dbinet.cgi', proxies=None)
# Pre-built URL-encoded form payload: indicator 1921002, okato 80000000,
# goods group 1501, weekly periods for years 2011-2015. The trailing `tbl`
# value is cp1251-encoded Russian for the "show table" button label.
data = 'rdLayoutType=Au&_Pokazateli=on&_okato=on&_grtov=on&_god=on&_period=on&a_Pokazateli=1&a_okato=2&a_grtov=3&Qry=Pokazateli%3A1921002%3Bokato%3A80000000%3Bgrtov%3A1501%3Bgod%3A2011%2C2012%2C2013%2C2014%2C2015%3Bperiod%3A12011%2C22011%2C32011%2C42011%2C52011%2C62011%2C72011%2C82011%2C92011%2C102011%2C112011%2C122011%2C132011%2C142011%2C152011%2C162011%2C172011%2C182011%2C192011%2C202011%2C212011%2C222011%2C232011%2C242011%2C252011%2C262011%2C272011%2C282011%2C292011%2C302011%2C312011%2C322011%2C332011%2C342011%2C352011%2C362011%2C372011%2C382011%2C392011%2C402011%2C412011%2C422011%2C432011%2C442011%2C452011%2C462011%2C472011%2C482011%2C492011%2C502011%2C512011%2C12012%2C22012%2C32012%2C42012%2C52012%2C62012%2C72012%2C82012%2C92012%2C102012%2C112012%2C122012%2C132012%2C142012%2C152012%2C162012%2C172012%2C182012%2C192012%2C202012%2C212012%2C222012%2C232012%2C242012%2C252012%2C262012%2C272012%2C282012%2C292012%2C302012%2C312012%2C322012%2C332012%2C342012%2C352012%2C362012%2C372012%2C382012%2C392012%2C402012%2C412012%2C422012%2C432012%2C442012%2C452012%2C462012%2C472012%2C482012%2C492012%2C502012%2C512012%2C12013%2C22013%2C32013%2C42013%2C52013%2C62013%2C72013%2C82013%2C92013%2C102013%2C112013%2C122013%2C132013%2C142013%2C152013%2C162013%2C172013%2C182013%2C192013%2C202013%2C212013%2C222013%2C232013%2C242013%2C252013%2C262013%2C272013%2C282013%2C292013%2C302013%2C312013%2C322013%2C332013%2C342013%2C352013%2C362013%2C372013%2C382013%2C392013%2C402013%2C412013%2C422013%2C432013%2C442013%2C452013%2C462013%2C472013%2C482013%2C492013%2C502013%2C512013%2C522013%2C12014%2C22014%2C32014%2C42014%2C52014%2C62014%2C72014%2C82014%2C92014%2C102014%2C112014%2C122014%2C132014%2C142014%2C152014%2C162014%2C172014%2C182014%2C192014%2C202014%2C212014%2C222014%2C232014%2C242014%2C252014%2C262014%2C272014%2C282014%2C292014%2C302014%2C312014%2C322014%2C332014%2C342014%2C352014%2C362014%2C372014%2C382014%2C392014%2C402014%2C412014%2C422014%2C432014%2C442014%2C452014%2C462014%2C472014%2C482014%2C492014%2C502014%2C512014%2C12015%2C22015%2C32015%2C42015%2C52015%2C62015%2C72015%2C82015%2C92015%2C102015%2C112015%2C122015%2C132015%2C142015%2C152015%2C162015%2C172015%2C182015%2C192015%2C202015%2C212015%2C222015%2C232015%2C242015%2C252015%2C262015%2C272015%2C282015%2C292015%2C302015%2C312015%2C322015%2C332015%2C342015%2C352015%2C362015%2C372015%2C382015%2C392015%2C402015%2C412015%2C422015%2C432015%2C442015%2C452015%2C462015%2C472015%2C482015%3B&QryGm=Pokazateli_z%3A1%3Bokato_z%3A2%3Bgrtov_z%3A3%3Bgod_s%3A1%3Bperiod_b%3A1%3B&QryFootNotes=%3B&YearsList=2011%3B2012%3B2013%3B2014%3B2015%3B&tbl=%CF%EE%EA%E0%E7%E0%F2%FC+%F2%E0%E1%EB%E8%F6%F3'
# POST the payload; presumably Jumper.post(url, data, raw) — url=None means
# "use the session's base url" and True requests the raw body. TODO confirm
# against the spider module.
content = gks_ru.post(None,data,True)
# Offline-debug alternative, kept for reference:
#file = open('content.html')
#content = file.read()
# The site serves cp1251; decode, then normalize decimal commas to dots so
# the numbers parse with float().
content = content.decode('cp1251')
content = content.replace(u",", u'.')
# First table row holds the year headers.
xpath = '//table[@class="OutTbl"]/tr[1]/td/text()'
years = [int(td.extract()) for td in Selector(text=content).xpath(xpath)]
values = []
weeks = []
print years
xpath = '//table[@class="OutTbl"]/tr'
count = 0
# param selects which value column (td index) to extract from each row.
param = 2
column = 0
# Walk the data rows (skip the header row); re-parsing each row's HTML
# with a fresh Selector to apply a td-relative xpath.
for tr in Selector(text=content).xpath(xpath)[1:]:
    count += 1
    xpath = u'//td[{}]/text()'.format(param)
    tdn = Selector(text=tr.extract()).xpath(xpath).extract()[0]
    # u'\xa0' (non-breaking space) marks an empty cell in this table.
    if tdn != u'\xa0':
        values.append(float(tdn))
    # NOTE(review): source is truncated here — the else branch (and the rest
    # of this script: weeks handling, aprox/matplotlib use) is missing.
    else:
class Manager:
    """Scraper orchestrator: crawls a rutracker forum, looks films up on
    kinopoisk.ru, and stores results via a dbconnecter-backed database.

    Formatting reconstructed from whitespace-mangled source; statement
    grouping inside suites is inferred — verify against the original.
    """
    # Directory of this module; used to locate data.txt and the html dump dir.
    path = os.path.dirname(os.path.abspath(__file__))
    # data.txt holds a Python-literal dict of config (postgres DSN, cookies,
    # poster url, static path). Windows-style relative path (backslash).
    data = ast.literal_eval(open(path + r'\data.txt').read())

    def __init__(self):
        """Connect to the DB and, on success, build the scraper sessions
        and immediately start crawling. Silently does nothing on failure.
        """
        self.db = dbconnecter()
        if self.db.connect(self.data['postgres']):
            self.rutracker = Jumper(url='http://rutracker.org/forum/viewforum.php', proxies=None)
            self.kinopoisk = Jumper('http://www.kinopoisk.ru/index.php', None)
            # Pre-compile all regexes used by the parsing methods below.
            self.reCompile()
            self.posterurl = self.data['poster']
            self.staticpath = self.data["static"]
            # Old login-based auth, kept for reference:
            # self.rutracker.cookies = self.rutracker.post(url='http://login.rutracker.org/forum/login.php',logindata=self.data['login'])
            # Authenticate with pre-captured cookies instead of logging in.
            self.rutracker.setcookiejar_from_dict(self.data['cookies'])
            self.start()
            # print self.db.get('*','ruparser_topic','1=1')

    def start(self):
        """Crawl forum f=2093 six pages deep (50 topics per page), parse
        each page and archive the raw HTML with a timestamped filename.

        NOTE(review): calls self.parseTopics and self._save, which are not
        visible in this chunk of the file.
        """
        for i in range(0, 300, 50):
            # rutracker pagination: 'start' is the topic offset on forum 'f'.
            args = {'f': 2093, 'start': i}
            print 'get pos', i,
            content = self.rutracker.jump(args)
            if content is not None:
                print 'ok'
                self.parseTopics(content)
                tm = datetime.now()
                tm = tm.strftime("%d.%m.%y_%H-%M")
                # Archive the raw page, e.g. html\topics50_01.02.15_12-30.html
                self._save(self.path + r'\html\topics{}_{}.html'.format(args['start'], str(tm)), content)

    def reCompile(self):
        """Compile every regex the parsers use, once, as instance attrs."""
        self.resc = re.compile(r'GB', re.UNICODE)
        # NOTE(review): the class [0.0-9] is just {0-9, '.'} — the leading
        # "0.0" is redundant; matches float-looking digit/dot runs.
        self.rescfl = re.compile(r'[0.0-9]+', re.UNICODE)
        # Film title: leading non-space then everything up to '/' or '('.
        self.rename = re.compile('[\S]+[^/^(]+', re.UNICODE)
        # 4-digit year immediately after '[' and not followed by a 5th digit.
        self.reyear = re.compile(r'(?<=\[)\d{4}(?!\d)', re.UNICODE)
        self.renum = re.compile(r'\d+', re.UNICODE)
        self.retime = re.compile(r'\d+:\d+')
        # <wbr> word-break tags are stripped from kinopoisk HTML before parsing.
        self.resub = re.compile(r'<wbr>', re.UNICODE)
        # Single quotes are stripped from text (presumably to keep the
        # string-built SQL below from breaking).
        self.req = re.compile(r'\'', re.UNICODE)

    def getPoster(self, id):
        """Download the poster jpeg for kinopoisk film `id` and store it
        under staticpath as <id[0:2]>/<id[2:4]>/<id[4]>/<id[5]>.jpg.

        NOTE(review): `id`, `dir` and `file` shadow builtins; the bare
        except hides any os.stat error; the file handle is never closed;
        ids shorter than 6 chars would raise IndexError.
        """
        id = str(id)
        raw = self.kinopoisk.raw(self.posterurl + str(id) + '.jpg')
        # Shard the poster files into nested dirs keyed by id digits.
        dir = self.staticpath + id[0:2] + '/' + id[2:4] + '/' + id[4] + '/'
        try:
            os.stat(dir)
        except:
            os.makedirs(dir)
        file = open(dir + id[5] + '.jpg', 'wb')
        file.writelines(raw)

    def dbfind(self, name, year):
        """Return the film id for (name, year): first from the local DB,
        otherwise by searching kinopoisk.ru and inserting via kinopars.

        SECURITY(review): the WHERE clause is built by string concatenation
        from scraped input — SQL injection risk; the quote-stripping regex
        (self.req) only partially mitigates it. Should be parameterized.
        """
        where = " name like '" + name.encode('utf8') + "%' and year='" + str(year) + "'"
        fid = self.db.getID('ruparser_film', where)
        if fid:
            print 'was found in dbase id=', fid
            return fid
        else:
            # kinopoisk advanced-search form fields.
            args = {u'level': u'7', u'first': u'yes', u'from': u'forma', u'result': u'adv',
                    u'm_act[from]': u'forma', u'm_act[what]': u'content', u'm_act[find]': name.encode('utf8'), u'm_act[year]': year.encode('utf8')}
            content = self.kinopoisk.jump(args)
            # Offline-debug alternative, kept for reference:
            # file = open('force.html')
            # content = file.read()
            # Implicitly returns None when the request fails.
            if content is not None:
                return self.kinopars(name, year, content)

    def kinopars(self, name, year, content):
        """Parse a kinopoisk film page and insert the film into the DB.

        Returns the new DB row id, or None when the page lacks a
        description, rating or canonical film id.
        """
        # Page is cp1251; re-encode to utf8 and drop <wbr> break tags.
        content = content.decode('cp1251')
        content = content.encode('utf8')
        content = self.resub.sub('', content)

        def check(obj):
            # First element of an xpath extract, or '' when empty.
            if obj:
                return obj[0]
            else:
                return ''

        # Canonical link holds the numeric film id.
        xpath = '//link[@rel="canonical"]/@href'
        id = Selector(text=content).xpath(xpath).extract()
        if id:
            id = self.renum.findall(id[0])[0]
        xpath = '//div[@class="brand_words"][@itemprop="description"]/text()'
        text = check(Selector(text=content).xpath(xpath).extract())
        # Strip single quotes (see reCompile: protects the SQL insert).
        text = self.req.sub('', text)
        xpath = '//span[@class="rating_ball"]/text()'
        rating = check(Selector(text=content).xpath(xpath).extract())
        if text and rating and id:
            print 'was found on kinopoisk.ru/film', id
            # poster becomes the literal string u'true'/u'false' flag.
            xpath = '//a[@class="popupBigImage"]/img/@src'
            poster = check(Selector(text=content).xpath(xpath).extract())
            if poster == 'http://st.kp.yandex.net/images/movies/poster_none.png':
                poster = u'false'
            else:
                poster = u'true'
            xpath = '//span[@class="ratingCount"]/text()'
            count = Selector(text=content).xpath(xpath).extract()
            if count:
                count = count[0]
                # Drop the non-breaking-space thousands separators.
                count = count.replace(u'\xa0', u'')
            else:
                count = 0;
            print 'rating', rating, count,
            # Running time: either already H:MM (take the second cell) or a
            # bare minute count that is converted to H:M below.
            xpath = '//td[@class="time"]/text()'
            time = Selector(text=content).xpath(xpath).extract()
            nulltime = '0:0'
            if len(time) > 1:
                time = self.retime.findall(time[1])
                if len(time) >= 1:
                    time = time[0]
                else:
                    time = nulltime
            elif len(time) == 1:
                time = self.renum.findall(time[0])
                if len(time) >= 1:
                    time = int(time[0])
                    # Python 2 integer division: minutes -> H:M.
                    th = time / 60
                    tm = time - (th * 60)
                    time = str(th) + ':' + str(tm)
                else:
                    time = nulltime
            else:
                time = nulltime
            print 'time', time,
            xpath = '//div[@id="block_rating"]/div[1]/div[2]/text()'
            imdb = check(Selector(text=content).xpath(xpath).extract())
            if imdb:
                # First float-looking run in the IMDb block.
                imdb = float(self.rescfl.findall(imdb)[0])
            else:
                imdb = 0;
            print 'imdb:', imdb
            head = '(name,year,text,rating,count,imdb,time,kinopoiskid,poster)'
            values = (name.encode('utf8'), year, text.encode('utf8'), rating, count, imdb, time, id, poster)
            fid = self.db.insert('ruparser_film', head, values)
            return fid

    def kinosearch(self, fullname):
        """Extract (name, year) from a raw topic title like
        'Title (...) [2014 ...]' and delegate to dbfind.

        Returns None when the title doesn't match the expected pattern.
        """
        try:
            name = self.rename.findall(fullname)[0]
            year = self.reyear.findall(fullname)[0]
            print 'search', name, year,
            return self.dbfind(name, year)
        except IndexError, er:
            # Title didn't contain a parsable name/[year] — skip it.
            print 'kinosearch', er
            return None