def lastunseen(seriesName):
    """Return the URL of the last unseen episode of seriesName (None if none is pending)."""
    parser = UnseenHTMLParser()
    page = GetPage.getpage(config.series_page)["page"]
    parser.feed(page)
    for u in parser.get_unseen():
        if re.match('.*/%s/.*' % seriesName, u):
            return config.turbofilm_base + u
def listunseen(retlist=False):
    """Group unseen episodes by series; return a sorted list or a printable summary."""
    unseen = {}
    unseen_list = []
    retstr = "\n"
    parser = UnseenHTMLParser()
    page = GetPage.getpage(config.series_page)["page"]
    parser.feed(page)
    for u in parser.get_unseen():
        series = re.match('/Watch/(.*)/Season', u).groups()[0]
        if series:
            unseen.setdefault(series, [])
            unseen[series].append(u)
    for k in unseen.keys():
        unseen_list.append((len(unseen[k]), k))
    # sort by the number of unseen episodes, ascending
    unseen_list.sort(key=lambda e: e[0])
    if retlist:
        return unseen_list
    for e in unseen_list:
        # a count of 3 apparently means "3 or more" on the source page
        if e[0] == 3:
            prefix = ">="
        else:
            prefix = "=="
        retstr += prefix + " %d\t%s\n" % e
    for e in unseen_list:
        try:
            os.mkdir(os.path.join(config.wrkdir, e[1]))
        except OSError:
            # the working directory for this series already exists
            pass
    retstr += "\n" + "-" * 20 + "\n\t%s\n" % parser.get_unseen_text()
    return retstr
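# Illustrative usage sketch (not part of the original module); assumes config
# and the logged-in session behind GetPage.getpage are already set up.
def print_unseen_summary():
    # retlist=True yields [(count, series), ...] sorted by ascending count
    for count, series in listunseen(retlist=True):
        print "%d unseen episode(s) of %s" % (count, series)
    # retlist=False returns the preformatted report string instead
    print listunseen()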
def main(argv):
    email_of_interest = ''
    outputfile = 'out.txt'
    months = 1
    try:
        opts, args = getopt.getopt(argv, "he:o:m:", ["email=", "ofile=", "months="])
    except getopt.GetoptError:
        print 'scrape.py -e <email> -o <outputfile> -m <months>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'scrape.py -e <email> -o <outputfile> -m <months>'
            sys.exit()
        elif opt in ("-e", "--email"):
            # the archive pages apparently render addresses as "user at host"
            email_of_interest = arg.replace("@", " at ")
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-m", "--months"):
            months = int(arg)
    page_lines = GetPage.get_page_contents(months)
    emails = ProcessContents.process_contents(page_lines, email_of_interest)
    f = open(outputfile, 'w')
    f.write(''.join(emails))
    f.close()
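# Illustrative entry point (an assumption about how scrape.py is meant to be run):
#   python scrape.py -e user@example.com -o replies.txt -m 3
# fetches three months of archive pages and writes whatever ProcessContents
# collects for "user at example.com" (the obfuscated form) to replies.txt.
if __name__ == "__main__":
    main(sys.argv[1:])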
def __init__(self):
    self.rpath = RfcPaths.RfcPaths()
    self.gp = GetPage.GetPage()
    self.gu = GetUrl.GetUrl()
    self.pno = 0
    self.page = self.get_rfc_index()
    if self.page is not None:
        self.text_index = {}
        self.create_rfc_text_index()
def lastunseen_ssn(seriesName):
    """Return [seriesName, season, episode] for the last unseen episode of seriesName."""
    parser = UnseenHTMLParser()
    page = GetPage.getpage(config.series_page)["page"]
    parser.feed(page)
    for u in parser.get_unseen():
        if re.match('.*/%s/.*' % seriesName, u):
            m = re.match("/Watch/%s/Season([0-9]+)/Episode([0-9]+)$" % seriesName, u).groups()
            m = [int(a) for a in m]
            m.insert(0, seriesName)
            return m
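# Illustrative sketch: lastunseen() and lastunseen_ssn() answer the same
# question as a URL and as a [series, season, episode] triple respectively.
# The series name "Fringe" is only a placeholder.
def show_next_unseen(seriesName="Fringe"):
    url = lastunseen(seriesName)       # e.g. config.turbofilm_base + "/Watch/<series>/Season4/Episode7"
    ssn = lastunseen_ssn(seriesName)   # e.g. ["Fringe", 4, 7]
    if url is None or ssn is None:
        print "nothing unseen for %s" % seriesName
        return
    name, season, episode = ssn
    print "next unseen: %s S%02dE%02d -> %s" % (name, season, episode, url)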
def getFreq(url):
    """Return the word-frequency table for url, caching it as a pickle under CACHE_PATH."""
    setupPath()
    # cache key; note that str(hash(url)) is only stable while hash randomization is off
    h = str(hash(url))
    path = os.path.join(CACHE_PATH, h)
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    else:
        page = GetPage.getURLText(url)
        freq = WordExtract.wordFreq(page)
        with open(path, "wb") as f:
            pickle.dump(freq, f)
        return freq
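# Illustrative helper (assumes WordExtract.wordFreq returns a {word: count} dict,
# which is what the pickled cache stores): rank the most frequent words on a page.
def top_words(url, n=10):
    freq = getFreq(url)
    # second and later calls for the same url are served from the on-disk cache
    ranked = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)
    return ranked[:n]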
def watchEpisode(eid, offline=False):
    postdata = {"watch": 1, "eid": eid}
    if offline:
        # no network: append the request to the offline store for a later replay
        try:
            f = open(config.offline_store)
            d = pickle.load(f)
            f.close()
        except IOError:
            d = []
        d.append(postdata)
        f = open(config.offline_store, "w+")
        pickle.dump(d, f)
        f.close()
        return {'page': ''}
    else:
        return GetPage.getpage(config.watchUrl, postdata)
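# Sketch of the matching "replay" step (an assumption, not in the original code):
# push the queued marks to the server once a connection is available, reading
# the same pickled list that watchEpisode(..., offline=True) appends to.
def flush_offline_store():
    try:
        f = open(config.offline_store)
        queued = pickle.load(f)
        f.close()
    except IOError:
        return  # nothing was queued offline
    for postdata in queued:
        # each entry already carries the "watch"/"eid" fields config.watchUrl expects
        GetPage.getpage(config.watchUrl, postdata)
    # truncate the store after the replay
    open(config.offline_store, "w").close()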
def getMicroblog(idd, pages, opener, MIDs, batch=20):
    #global db
    global sleepTime
    global randomTimes
    result = True
    try:
        for i in range(1, pages + 1):
            micros = 0
            TEnd = time.time()
            TBegin = [0.0]
            for j in range(0, 3):
                try:
                    # fetch page i, pagebar j of the user's feed
                    # (the exact semantics of tryGetPage are not fully known)
                    text = GetPage.tryGetPage(idd, i, j, opener)
                except GetPage.FailureOverTimesException as e:
                    print(e)
                    continue
                except GetPage.LoadingFailingException as e:
                    print(e)
                    continue
                except GetPage.NetworkBusyException as e:
                    print(e)
                    continue
                microblogs = text.split("div action-type=\"feed_list_item\" ")
                micros += len(microblogs)
                if len(microblogs) == 1 and isLastOne(microblogs[0]):
                    raise TimeLimitException
                for microblog in microblogs:
                    # store() checks the post's timestamp; a False return means the
                    # post is older than the time limit, so stop crawling this user
                    if not store(idd, microblog, MIDs, TBegin):
                        raise TimeLimitException
                time.sleep(randomTimes * random.random() + sleepTime)
            if i == 1:
                # TBegin[0] was filled in by store(), apparently in milliseconds
                TBegin[0] /= 1000
                days = (TEnd - TBegin[0]) / (3600 * 24) + 0.1
                # if the user averages more than 0.5 posts per day, mark the result
                # False so that this user's comments will not be crawled
                if (micros / days) > 0.5:
                    result = False
                    print("post too frequently")
                    break
    finally:
        # the return here also absorbs TimeLimitException, ending the crawl cleanly
        print("end grab microblog", idd)
        return result
def get_metadata(t_name, quality, offset=0):
    t_name, season, number = get_series_ssn(t_name, offset=offset)
    fname_base = "S%02dE%02d" % (int(season), int(number))
    target_dir = config.wrkdir
    file_base = os.path.join(target_dir, t_name, fname_base)
    parser = MetaHTMLParser()
    page = GetPage.getpage(ssn_url(t_name, season, number))["page"].decode('utf-8')
    iasid = GetPage.p.check_ssid()
    parser.feed(page)
    try:
        xml_metadata = wb64(unquote(parser.metadata))
    except AttributeError:
        # the parser found no metadata block on the page
        print "No more episodes"
        sys.exit(0)
    metadata = xml2obj.xml2obj(xml_metadata)
    metadata["fetched_quality"] = quality
    if metadata["sizes"]["hq"] == "0":
        # no HQ stream available; fall back to the default quality
        metadata["fetched_quality"] = "default"
        quality = "default"
    metadata["iasid"] = iasid
    metadata["season"] = season
    metadata["number"] = number
    if not os.path.isdir(os.path.dirname(file_base)):
        os.mkdir(os.path.dirname(file_base))
    # write the .meta file once, or rewrite it if a previous run left it empty
    if not os.path.isfile(file_base + ".meta") or os.stat(file_base + ".meta").st_size == 0:
        fd = open(file_base + ".meta", "w")
        fd.write(json.dumps(metadata))
        fd.close()
    # average bitrate in bytes per second
    metadata.update({'bitrate': float(metadata["sizes"][quality]) / float(metadata['duration'])})
    return metadata, file_base
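# Illustrative sketch: using the returned metadata to size up a download.
# Sizes appear to be byte counts (the bitrate above is bytes/second); the
# series name, quality and link speed are placeholders.
def estimate_download(t_name="Fringe", quality="hq", speed_bps=500 * 1024):
    metadata, file_base = get_metadata(t_name, quality)
    size = float(metadata["sizes"][metadata["fetched_quality"]])
    eta = size / speed_bps
    print "%s: %.1f MiB, ~%d s at %d B/s (stream bitrate %.0f B/s)" % (
        file_base, size / 2 ** 20, int(eta), speed_bps, metadata["bitrate"])
    return metadata, file_base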
def getPageNumber(idd, opener):
    """Return how many microblog list pages user idd has (0 if none)."""
    global db
    try:
        # tryGetPage(idd, page, pagebar, opener); it is unclear why pagebar is 3 here
        text = GetPage.tryGetPage(idd, 1, 3, opener)
    except GetPage.NoMicroblogException:
        return 0
    if text.find("微博列表") == -1:
        # no "微博列表" ("microblog list") pager on the page, so there is a single page
        return 1
    # TODO: fragile -- assumes the page count is the first space-delimited number
    matches = re.search(r' (\d+) ', text)
    if matches is None:
        return 0
    n = int(matches.group(1))
    return n
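# Illustrative driver (placeholder names; opener is whatever logged-in opener
# GetPage expects): getPageNumber() sizes the crawl and getMicroblog() walks it.
def crawl_user(idd, opener, MIDs):
    pages = getPageNumber(idd, opener)
    if pages == 0:
        print("user", idd, "has no microblog pages")
        return False
    # False here means the user posts too frequently for comment crawling
    return getMicroblog(idd, pages, opener, MIDs)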
def unwatchEpisode(eid):
    postdata = {"watch": 0, "eid": eid}
    return GetPage.getpage(config.watchUrl, postdata)