def __init__(self, link, posts, handbook):
    """Remember the site link, known posts and handbook; create helpers.

    link     -- stored as ``self.link``
    posts    -- stored as ``self.posts``
    handbook -- stored as ``self.handbook``
    """
    # Plain inputs first, then the helper objects.
    self.link = link
    self.posts = posts
    self.handbook = handbook
    self.file = File()
    self.log = Log('sites')
def save(self, dirTarget,withThumb=False,streamMode=StreamMode.Exodus):
    """
        Save the movie in .strm format for GOmovies addon.

        A folder named after the cleaned "<name> (<year>)" is ensured below
        dirTarget. The .strm playlist is (re)written on every call, the
        poster is fetched only when withThumb is set and folder.jpg does
        not exist yet, and the .nfo file is written only once.

        dirTarget  -- parent folder the movie folder is placed in
        withThumb  -- when true, load the 'Poster' image into folder.jpg
        streamMode -- StreamMode value selecting which link is written

        Returns False when the .nfo file already existed, True when it
        was created by this call.
    """
    # (mode, method name) pairs; only the matching resolver is called.
    resolvers = (
        (StreamMode.Exodus, 'asExodus'),
        (StreamMode.Pulsar, 'asPulsar'),
        (StreamMode.Salts, 'asSalts'),
        (StreamMode.Quasar, 'asQuasar'),
    )

    name = '{0} ({1})'.format( self.nameEn.lower()[:40] , self.year )
    base = File.cleanName(name)
    dirTarget = os.path.join( dirTarget , base )
    info = os.path.join( dirTarget, '{0}.{1}'.format( base , 'nfo' ) )
    File.ensureFolder(dirTarget)

    links = [
        [name, getattr(self, method)()]
        for mode, method in resolvers
        if streamMode == mode
    ]

    strm = os.path.join( dirTarget, '{0}.{1}'.format( base , 'strm' ) )
    # Explicit UTF-8: unicode text is written below, and without an
    # encoding the result depends on the platform default codec.
    with codecs.open(strm, 'w', encoding='utf-8') as ff:
        ff.write("#EXTM3U\n")
        for title, url in links:
            ff.write(u'#EXTINF:{0},{1}\n'.format(0, title))
            ff.write(u'{0}\n'.format(url))

    if withThumb:
        thumb = os.path.join( dirTarget, 'folder.jpg' )
        if not os.path.exists(thumb):
            FileLoader.load(self.imdbInfo.get('Poster'), '@' + thumb )

    # An existing .nfo marks a movie that was already saved before.
    if os.path.exists(info):
        return False
    with codecs.open(info, 'w', encoding='utf-8') as ff:
        ff.write('http://www.imdb.com/title/{0}'.format(self.idImdb))
    return True
def __init__(self):
    """Set up the 'sites' scraper state: db, file and log helpers, empty news list."""
    self.type = 'sites'
    self.news = []
    # Helpers are created in the same order as before: Db, File, Log.
    self.db = Db()
    self.file = File()
    self.log = Log(self.type)
def initPaths(self):
    """Derive the working folders from self.path / self.data and create them.

    Sets dirRoot, dirSource, cache and dirLogs on the instance, ensures
    the cache and log folders exist, and publishes the log callable and
    cache folder on the Log and FileLoader classes.
    """
    self.dirRoot = self.path
    self.dirSource = os.path.join(self.dirRoot, 'Feeder')

    self.cache = os.path.join(self.data, "Cache")
    self.dirLogs = os.path.join(self.data, "Logs")
    for folder in (self.cache, self.dirLogs):
        File.ensureFolder(folder)

    # Class-level settings shared by every user of Log / FileLoader.
    Log.log = self.log
    FileLoader.dirCache = os.path.join(self.data, "Cache")
def initPaths(self):
    """Derive the working folders from self.path / self.data and create them.

    Sets dirRoot, cache and dirLogs on the instance, ensures the cache
    and log folders exist, and publishes the cache folder, the tvdb
    "html" folder and the log callable on FileLoader, SeriesInfo and Log.
    """
    self.dirRoot = self.path

    self.cache = os.path.join(self.data, "Cache")
    self.dirLogs = os.path.join(self.data, "Logs")
    for folder in (self.cache, self.dirLogs):
        File.ensureFolder(folder)

    # Class-level settings shared by every user of these classes.
    FileLoader.dirCache = os.path.join(self.data, "Cache")
    SeriesInfo.tvdbMoldDir = os.path.join(self.dirRoot, "html")
    Log.log = self.log
def save(self, dirTarget, langid=7, withThumb=False, streamMode=StreamMode.Exodus):
    """Write the series as a folder tree of .strm playlists below dirTarget.

    Layout: <dirTarget>/<name (year)>/<clean name>.S<season>/<episode>.strm,
    with a tvshow.nfo in the series folder and in every season folder.
    Episodes of season 0 and episodes named "TBA" are skipped. A failure
    while saving one episode is logged and does not stop the others.

    dirTarget  -- parent folder the series folder is placed in
    langid     -- language id forwarded to saveTvshow()
    withThumb  -- accepted for interface compatibility; not used here
    streamMode -- StreamMode value selecting which link is written
    """
    # (mode, method name) pairs; only the matching resolver is called.
    resolvers = (
        (StreamMode.Exodus, "asExodus"),
        (StreamMode.Pulsar, "asPulsar"),
        (StreamMode.Salts, "asSalts"),
        (StreamMode.Quasar, "asQuasar"),
    )

    # NOTE(review): both counters are tallied but never returned or
    # stored in the visible code - confirm whether callers expect them.
    added = 0
    episodes = 0

    base = File.cleanName(u"{0} ({1})".format(self.name.lower(), self.year))
    dirSeries = os.path.join(dirTarget, base)
    self.saveTvdbInfo()
    if not os.path.exists(dirSeries):
        added += 1
    self.saveTvshow(dirSeries, langid=langid)

    for episode in self.episodes:
        try:
            if episode.season == 0 or episode.name == "TBA":
                continue

            dirSeason = os.path.join(
                dirSeries,
                File.cleanName(u"{0}.S{1:02}".format(self.findNameClean(), episode.season)),
            )
            self.saveTvshow(dirSeason, langid=langid)

            name = u"{3}.S{0:02}E{1:02}.{2}".format(episode.season, episode.episode, episode.name, self.name)

            # Keep folder + file name within a 256 character budget
            # (10 characters reserved for separator and extension).
            # The previous expression used "- -len(dir)", which ADDED the
            # folder length, so the title was effectively never shortened.
            tag = ".S{0:02}E{1:02}.".format(episode.season, episode.episode)
            room = max(0, 256 - len(dirSeason) - len(self.name) - len(tag) - 10)
            base = File.cleanName(u"{0}{1}{2}".format(self.name, tag, episode.name[:room]))

            links = [
                [name, getattr(episode, method)()]
                for mode, method in resolvers
                if streamMode == mode
            ]

            strm = os.path.join(dirSeason, u"{0}.{1}".format(base, "strm"))
            if not os.path.exists(strm):
                episodes += 1
            # Explicit UTF-8: episode titles are unicode and may contain
            # characters the platform default codec cannot encode.
            with codecs.open(strm, "w", encoding="utf-8") as ff:
                ff.write("#EXTM3U\n")
                for title, url in links:
                    ff.write(u"#EXTINF:{0},{1}\n".format(0, title))
                    ff.write(u"{0}\n".format(url))
        except Exception as ee:
            # Best effort: log the failure and continue with the next episode.
            Log.log(Text.formatException(ee))
class Log():
    """Append-only text log that also echoes every message to the console.

    Messages go to the 'log' file obtained from ``File.write_a`` under the
    name given at construction. ``begin()`` / ``finish()`` bracket a run and
    report its duration.
    """

    def __init__(self, filename):
        """filename -- log name passed to ``File.write_a`` on every write."""
        self.filename = filename
        self.file = File()

    def begin(self):
        """Start the timer and write the start banner."""
        self.start_time = time.time()
        # Timestamps are shifted by a fixed +2 hours from local time.
        self.write('--- start at {0} ---'.format(
            str(datetime.today() + timedelta(hours=2))))

    def finish(self):
        """Write the finish banner with the elapsed time since ``begin()``."""
        self.write("--- finish at {0}, duration: {1} ---".format(
            str(datetime.today() + timedelta(hours=2)), self.duration()))

    def write(self, msg):
        """Append ``msg`` plus a newline to the log file and print it."""
        f = self.file.write_a(self.filename, 'log')
        try:
            f.write(msg + '\n')
        finally:
            # Always release the handle, even when the write itself fails.
            f.close()
        # output to the console
        print(msg)

    def duration(self):
        """Return the time since ``begin()`` as "<value> sec" or "<value> min".

        Values above 60 seconds are reported in minutes; both forms are
        rounded to 4 decimals. Requires ``begin()`` to have been called.
        """
        result = time.time() - self.start_time
        if result > 60:
            result = round(result / 60, 4)
            per = 'min'
        else:
            result = round(result, 4)
            per = 'sec'
        return "{0} {1}".format(result, per)
def library(self):
    """Return the library folder, creating it and its tvdb sub-folders.

    The folder comes from the 'library' config string; an empty string
    falls back to "<self.data>/Library". The ".tvdb" and ".html"
    sub-folders are published on SeriesInfo and ensured as well.
    """
    configured = self.config.getStr('library')
    root = configured if configured != '' else os.path.join(self.data, "Library")

    SeriesInfo.tvdbDataDir = os.path.join(root, ".tvdb")
    SeriesInfo.tvdbHtmlDir = os.path.join(root, ".html")

    for folder in (root, SeriesInfo.tvdbDataDir, SeriesInfo.tvdbHtmlDir):
        File.ensureFolder(folder)
    return root
class Scraping(Union):
    """Run the per-site scrapers for every enabled site and collect their news."""

    def __init__(self):
        self.type = 'sites'
        self.db = Db()
        self.file = File()
        self.log = Log(self.type)
        self.news = []

    def site_list(self):
        """Return the list of sites from the db."""
        return self.db.get_sites()

    def posts_list(self):
        """Return the first column of every 'post' row since ``self.day_ago()``."""
        result = self.db.get_posts('post', self.day_ago())
        return [post[0] for post in result]

    def handbook_list(self):
        """Return the handbook rows as ``{row[0]: {'title': row[1], 'check': row[2]}}``."""
        return {
            handbook[0]: {'title': handbook[1], 'check': handbook[2]}
            for handbook in self.db.get_handbook()
        }

    def start(self):
        """Scrape every enabled site.

        Returns False when ``start_import()`` reports the import is not
        ready; otherwise returns None after ``finish_import()``.
        """
        # check import is ready
        if not self.start_import():
            return False

        self.posts = self.posts_list()
        self.handbook = self.handbook_list()

        if self.handbook:
            for site in self.site_list():
                if site[3] > 0:
                    self.scrap(site)
                else:
                    self.log.write("resourse {0} is desabled".format(site[1]))

        print(len(self.news))
        self.finish_import()

    def scrap(self, site):
        """Run the scraper for one site row and merge its results.

        A site whose type has no scraper is logged and skipped, so one
        unknown db entry no longer aborts the whole import.
        """
        self.file.set_file(site[1])
        resourse = self.switch(site)
        if resourse is None:
            self.log.write(
                "no scraper for resourse {0} (type {1}), skipped".format(site[1], site[2]))
            return
        resourse.start()
        self.news = self.merge(self.news, resourse.get_posts())
        self.posts = resourse.get_titles()

    def switch(self, site):
        """Return the scraper instance for ``site[2]``, or None when unknown."""
        x = site[2]
        self.log.write("---\n{0} start at {1}".format(site[1], self.get_day()))

        # Site type -> scraper class. Built on call so the class names
        # are resolved at call time, as the former if/elif chain did.
        scrapers = {
            'thebitcoinnews': CThebitcoinnews,
            'coinjournal': CCoinjournal,
            'coindesk': CCoindesk,
            'bitcoin': CBitcoin,
            'cointelegraph': CCointelegraph,
            'bitcoinmagazine': CBitcoinmagazine,
            'newsbtc': CNewsbtc,
            'forklog': CForklog,
            'coinspeaker': CCoinspeaker,
            'bitcoinist': CBitcoinist,
            'bitcoinertoday': CBitcoinertoday,
            'coindoo': CCoindoo,
            'trustnodes': CTrustnodes,
            'btcmanager': CBtcmanager,
            'usethebitcoin': CUsethebitcoin,
            'investinblockchain': CInvestinblockchain,
            'ethereumworldnews': CEthereumworldnews,
            'coinstaker': CCoinstaker,
            'livebitcoinnews': CLivebitcoinnews,
            'coinsnewbium': CCoinsnewbium,
            'ccn': CCcn,
            'themerkle': CThemerkle,
            'ethnews': CEthnews,
            'zycrypto': CZycrypto,
            'profitconfidential': CProfitconfidential,
            'cryptoanswers': CCryptoanswers,
            'bloomberg': CBloomberg,
        }
        scraper = scrapers.get(x)
        if scraper is None:
            return None
        return scraper(site[1], self.posts, self.handbook)
class Main():
    """Shared helpers for the per-site scrapers: page loading, menu and
    date parsing, text clean-up and handbook matching."""

    def __init__(self, link, posts, handbook):
        self.file = File()
        self.log = Log('sites')
        self.link = link
        self.posts = posts
        self.handbook = handbook

    def read_file(self):
        """Return the content of the stored 'page' html file."""
        with self.file.read('page', 'html') as input_file:
            return input_file.read()

    def get_menu(self, type, value, inner='', span=False):
        """Collect the menu entries whose title is listed in ``self.menu``.

        type  -- type of tag (id or class)
        value -- value of tag
        inner -- optional tag name of a container holding the ``<ul>``;
                 when given, ``type``/``value`` select that container
        span  -- forwarded to ``clear_title`` as its ``clear`` flag

        Returns a list of ``{'title': ..., 'url': ...}`` dicts, one per
        distinct title. Raises RuntimeError when the menu is not found.
        """
        soup = self.soup()
        if inner != '':
            container = soup.find(inner, {type: value})
            # A missing container must end in the RuntimeError below,
            # not in an AttributeError on None.
            links = container.find('ul') if container is not None else None
        else:
            links = soup.find('ul', {type: value})
        if not links:
            raise RuntimeError("structure of the site menu has changed")

        pages = []
        titles = []
        for item in links.find_all('a'):
            title = self.clear_title(item, span)
            if title in self.menu and title not in titles:
                titles.append(title)
                pages.append({
                    'title': title,
                    'url': self.check_url(item.get('href'))
                })
        return pages

    def set_file(self, url):
        self.file.set_file(url, 'sites')

    def soup(self):
        """Parse the stored page with BeautifulSoup's 'html.parser'."""
        text = self.read_file()
        return BeautifulSoup(text, 'html.parser')

    def check_date(self, date, is_timestump=False, format=None):
        """Return ``date`` as a timestamp, or None when older than one day.

        date         -- date string, or a timestamp when ``is_timestump``
        is_timestump -- skip parsing; ``date`` is returned unchanged if recent
        format       -- strptime format; by default the first 19
                        characters are parsed as "%Y-%m-%dT%H:%M:%S"

        Raises RuntimeError when the value cannot be parsed.
        """
        if format is None:
            date = date[:19]
            format = "%Y-%m-%dT%H:%M:%S"
        try:
            if not is_timestump:
                date = time.mktime(datetime.strptime(date, format).timetuple())
            day_ago = datetime.today() - timedelta(days=1)
            if (float(date) < day_ago.timestamp()):
                return None
        except ValueError:
            raise RuntimeError("structure date has changed")
        return date

    def get_posts(self):
        return self.result

    def get_titles(self):
        return self.posts

    def clear(self, text):
        """Strip emoji/symbol characters, turn non-breaking spaces into
        plain spaces and trim surrounding whitespace."""
        try:
            myre = re.compile(
                u"[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]+",
                re.UNICODE)
        except re.error:
            # Fallback expressed with surrogate pairs for builds that
            # reject the wide code-point ranges above.
            myre = re.compile(
                u"(\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f\ude80-\udeff]|[\u2600-\u26FF\u2700-\u27BF])+",
                re.UNICODE)
        return myre.sub(r'', text.replace("\xa0", " ")).strip()

    def change_date(self, date, format="%b %d, %Y"):
        """Convert a relative "<number> <unit> ago" string to a formatted date.

        Units are matched case-insensitively: seconds, minutes, hours and
        days, including singular and short forms. Raises RuntimeError for
        any other unit instead of failing later with an unrelated
        AttributeError.
        """
        number, unit, _ago = date.split(' ')
        unit = unit.lower()
        if unit in ('hours', 'hour', 'hrs', 'hr'):
            delta = timedelta(hours=int(number))
        elif unit in ('minutes', 'minute', 'mins', 'min'):
            delta = timedelta(minutes=int(number))
        elif unit in ('seconds', 'second', 'secs', 'sec'):
            delta = timedelta(seconds=int(number))
        elif unit in ('days', 'day'):
            delta = timedelta(days=int(number))
        else:
            raise RuntimeError("structure date has changed")
        return (datetime.now() - delta).strftime(format)

    def clear_title(self, point, clear):
        """Return the stripped text of ``point``; unless ``clear`` is true,
        nested ``<span>`` elements are removed first."""
        spans = point.find_all('span')
        if spans and not clear:
            for span in spans:
                span.extract()
        return point.text.strip()

    def check_url(self, url):
        """Return ``url`` as an absolute url below ``self.link``."""
        return self.link + url.replace(self.link, '').lstrip('/')

    def check_handbook_post(self, title, text):
        """Return the handbook keys whose title occurs in ``title`` or ``text``.

        An entry with ``check == 0`` is matched case-insensitively, any
        other entry case-sensitively. The title must be delimited by
        non-word characters or the string boundaries.
        """
        check = []
        for h, entry in self.handbook.items():
            flags = re.IGNORECASE if entry['check'] == 0 else 0
            # NOTE(review): the title is used as a regex fragment, not
            # escaped - confirm handbook titles never contain regex
            # metacharacters, or that this is intended.
            pattern = re.compile(r'(^|\W)' + entry['title'] + r'(\W|$)', flags)
            if pattern.search(text) is not None or pattern.search(title) is not None:
                check.append(h)
        return check

    def multiple_replacer(self, *key_values):
        """Return a function replacing every key with its value in one pass."""
        replace_dict = dict(key_values)
        replacement_function = lambda match: replace_dict[match.group(0)]
        pattern = re.compile("|".join([re.escape(k) for k, _ in key_values]), re.M)
        return lambda string: pattern.sub(replacement_function, string)

    def multiple_replace(self, string, *key_values):
        """Apply all ``(old, new)`` pairs to ``string`` in a single pass."""
        return self.multiple_replacer(*key_values)(string)
def cleanLogs(self):
    """Delete the files in self.dirLogs, then notify with the cleanLogs message."""
    File.delFiles(self.dirLogs)
    message = sm.str(sm.cleanLogs)
    self.notify(message)
def cleanCache(self):
    """Delete the files in self.cache, then notify with the cleanCache message."""
    File.delFiles(self.cache)
    message = sm.str(sm.cleanCache)
    self.notify(message)
def cleanLibrary(self):
    """Delete the library files, clean the source, then notify.

    NOTE(review): ``self.library`` is passed without being called -
    confirm it is a property or attribute rather than a plain method.
    """
    File.delFiles(self.library)
    self.cleanSource()
    message = sm.str(sm.cleanLibrary)
    self.notify(message)
def saveTvshow(self, dirTarget, langid=7):
    """Write tvshow.nfo with the thetvdb.com url of this series into dirTarget.

    dirTarget -- folder to write into; created when missing
    langid    -- value of the "lid" url parameter
    """
    File.ensureFolder(dirTarget)
    # Build the url before opening, so a failure here cannot leave a
    # truncated tvshow.nfo behind.
    url = "http://thetvdb.com/?tab=series&id={0}&lid={1}".format(self.tvdbId, langid)
    # open() instead of file(): the file() builtin exists only in Python 2.
    with open(os.path.join(dirTarget, "tvshow.nfo"), "w") as ff:
        ff.write(url)
def findNameClean(self):
    """Return the cleaned, lower-cased series name, at most 50 characters.

    A missing name (None) is replaced by "unknown". The fallback used to
    be computed but ignored, so None was handed to File.cleanName.
    """
    name = self.name if self.name is not None else "unknown"
    return File.cleanName(name)[:50].lower()
def __init__(self, filename):
    """Create the logger state.

    filename -- stored as ``self.filename``
    """
    self.file = File()
    self.filename = filename
def library(self):
    """Return the library folder, creating it when missing.

    The folder comes from the 'library' config string; an empty string
    falls back to "<self.data>/Library".
    """
    configured = self.config.getStr('library')
    root = configured if configured != '' else os.path.join(self.data, "Library")
    File.ensureFolder(root)
    return root