Example #1
def lastunseen(seriesName):
	"""Return the full URL of the first unseen episode of seriesName."""
	parser = UnseenHTMLParser()
	page = GetPage.getpage(config.series_page)["page"]
	parser.feed(page)
	for u in parser.get_unseen():
		if re.match(r'.*/%s/.*' % seriesName, u):
			return config.turbofilm_base + u
Example #2
def listunseen(retlist=False):
	"""Collect unseen episodes per series; return (count, series) tuples or a printable summary."""
	unseen = {}
	unseen_list = []
	retstr = "\n"
	parser = UnseenHTMLParser()
	page = GetPage.getpage(config.series_page)["page"]
	parser.feed(page)
	for u in parser.get_unseen():
		series = re.match('/Watch/(.*)/Season', u).groups()[0]
		if series:
			unseen.setdefault(series, []).append(u)
	for k in unseen:
		unseen_list.append((len(unseen[k]), k))
	# sort once, by the number of unseen episodes (fewest first)
	unseen_list.sort(key=lambda e: e[0])
	if retlist:
		return unseen_list
	for e in unseen_list:
		# the page apparently reports at most three unseen episodes per
		# series, so a count of 3 really means "3 or more"
		prefix = ">=" if e[0] == 3 else "=="
		retstr += prefix + " %d\t%s\n" % e
	for e in unseen_list:
		try:
			os.mkdir(os.path.join(config.wrkdir, e[1]))
		except OSError:
			pass
	retstr += "\n" + "-" * 20 + "\n\t%s\n" % parser.get_unseen_text()
	return retstr
Example #3
def main(argv):
    email_of_interest = ''
    outputfile = 'out.txt'
    months = 1

    try:
        opts, args = getopt.getopt(argv,"he:o:m:",["email=","ofile=","months="])
    except getopt.GetoptError:
        print 'scrape.py -e <email> -o <outputfile> -m <months>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'scrape.py -e <email> -o <outputfile> -m <months>'
            sys.exit()
        elif opt in ("-e", "--email"):
            # list archives usually render addresses as "user at domain",
            # so rewrite the query to that form
            email_of_interest = arg.replace("@", " at ")
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-m", "--months"):
            months = int(arg)

    # fetch the archive pages for the requested number of months, then pull
    # out the messages that involve the address of interest
    page_lines = GetPage.get_page_contents(months)
    emails = ProcessContents.process_contents(page_lines, email_of_interest)
    
    # write the collected messages out in one go
    with open(outputfile, 'w') as f:
        f.write(''.join(emails))
Example #4
def __init__(self):
    self.rpath = RfcPaths.RfcPaths()
    self.gp = GetPage.GetPage()
    self.gu = GetUrl.GetUrl()
    self.pno = 0
    # fetch the RFC index page and, if that succeeded, build the text index
    self.page = self.get_rfc_index()
    if self.page is not None:
        self.text_index = {}
        self.create_rfc_text_index()
Example #5
def lastunseen_ssn(seriesName):
	"""Return [seriesName, season, episode] for the first unseen episode."""
	parser = UnseenHTMLParser()
	page = GetPage.getpage(config.series_page)["page"]
	parser.feed(page)
	for u in parser.get_unseen():
		if re.match(r'.*/%s/.*' % seriesName, u):
			season, episode = re.match(
				r"/Watch/%s/Season([0-9]+)/Episode([0-9]+)$" % seriesName,
				u).groups()
			return [seriesName, int(season), int(episode)]
Example #6
def getFreq(url):
	setupPath()
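	# note: the cache key uses Python's built-in hash(), which is not
	# guaranteed to be stable across interpreter versions or configurations;
	# a hashlib digest of the URL would be a more durable key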
	h = str(hash(url))
	path = os.path.join(CACHE_PATH, h)
	if os.path.exists(path):
		with open(path,"rb") as f:
			return pickle.load(f)
	else:
		page = GetPage.getURLText(url)
		freq = WordExtract.wordFreq(page)
		
		with open(path,"wb") as f:
			pickle.dump(freq,f)
		
		return freq
Example #7
def watchEpisode(eid, offline=False):
    postdata = {"watch": 1, "eid": eid}
    if offline:
        # queue the request in a local pickle file instead of posting it now
        try:
            # pickle files should be read and written in binary mode
            with open(config.offline_store, "rb") as f:
                d = pickle.load(f)
        except IOError:
            d = []
        d.append(postdata)
        with open(config.offline_store, "wb") as f:
            pickle.dump(d, f)
        return {'page': ''}
    else:
        return GetPage.getpage(config.watchUrl, postdata)
Example #8
def getMicroblog(idd, pages, opener, MIDs, batch=20):
    #global db
    global sleepTime
    global randomTimes
    result = True
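    # walk up to `pages` result pages of user `idd`, storing every post until
    # the time window is exceeded or the posting density looks too high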
    try:
        for i in range(1, pages + 1):
            micros = 0
            TEnd = time.time()
            TBegin = [0.0]
            for j in range(0, 3):    
                try:    
                    #in fact, i don't know the details of this function
                    text = GetPage.tryGetPage(idd, i, j, opener)        
                except GetPage.FailureOverTimesException as e:
                    print(e)
                    continue
                except GetPage.LoadingFailingException as e:
                    print(e)
                    continue
                except GetPage.NetworkBusyException as e:
                    print(e)
                    continue 
        
                # each "feed_list_item" div in the returned HTML is one microblog
                microblogs = text.split("div action-type=\"feed_list_item\" ")
                micros += len(microblogs)
                if(len(microblogs)==1 and isLastOne(microblogs[0])):
                    raise TimeLimitException
                for microblog in microblogs:
                    # store() checks the post's timestamp; once it returns False
                    # the post is older than the time window and crawling stops
                    if not store(idd, microblog, MIDs, TBegin):
                        raise TimeLimitException
                time.sleep(randomTimes * random.random() + sleepTime)
            if(i==1):
                TBegin[0] /= 1000 
                days = (TEnd-TBegin[0])/(3600*24)+0.1
                # if the posting density exceeds 0.5 microblogs per day, return
                # False, i.e. this user's comments will not be crawled
                if (micros / days) > 0.5:
                    result = False
                    print("post too frequently")
                    break            
    finally:
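        # note: returning from the finally block swallows any pending exception,
        # including the TimeLimitException raised above to break out of the
        # nested loops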
        print("end grab microblog", idd)
        return result
Example #9
def get_metadata(t_name, quality, offset=0):
    t_name, season, number = get_series_ssn(t_name, offset=offset)
    fname_base = "S%02dE%02d" % (int(season), int(number))

    target_dir = config.wrkdir
    file_base = os.path.join(target_dir, t_name, fname_base)
    parser = MetaHTMLParser()
    page = GetPage.getpage(ssn_url(t_name, season,
                                   number))["page"].decode('utf-8')
    iasid = GetPage.p.check_ssid()
    parser.feed(page)
    try:
        # the episode metadata appears to be embedded in the page as a
        # url-quoted base64 blob; unquote()/wb64() decode it back to XML
        xml_metadata = wb64(unquote(parser.metadata))
    except AttributeError:
        # parser.metadata is never set when no episode is left, so bail out
        print "No more episodes"
        sys.exit(0)
    #print "Got XML" # xml_metadata

    metadata = xml2obj.xml2obj(xml_metadata)
    metadata["fetched_quality"] = quality
    if metadata["sizes"]["hq"] == "0":
        metadata["fetched_quality"] = "default"
        quality = "default"
    metadata["iasid"] = iasid
    metadata["season"] = season
    metadata["number"] = number

    if not os.path.isdir(os.path.dirname(file_base)):
        os.mkdir(os.path.dirname(file_base))

    # write the metadata cache file unless a non-empty one already exists
    meta_path = file_base + ".meta"
    if not os.path.isfile(meta_path) or os.stat(meta_path).st_size == 0:
        with open(meta_path, "w") as fd:
            fd.write(json.dumps(metadata))
    metadata['bitrate'] = float(metadata["sizes"][quality]) / float(metadata['duration'])
    #print "bitrate: %s byte/sec" % metadata['bitrate']
    return metadata, file_base
Example #10
def getPageNumber(idd, opener):
    global db  
    try: 
        #tryGetPage(idd,page,pagebar,opener)  why is pagebar here 3?? 
        text = GetPage.tryGetPage(idd, 1, 3, opener)
    except GetPage.NoMicroblogException:
        return 0
    
    
    #output(text)
    if text.find("微博列表") == -1:  # "microblog list" marker absent: only one page
        return 1
    # # this should be repaired: the page count is scraped from the pager's
    # "&nbsp;<n>&nbsp;" fragment, which is fragile
    matches = re.search(r'&nbsp;(\d+)&nbsp;', text)
    if matches is None:
        return 0
    
    n = int(matches.group(1))
    
    return n
Example #11
def unwatchEpisode(eid):
    postdata = { "watch": 0, "eid": eid }
    return GetPage.getpage(config.watchUrl, postdata)