class Main:
    """Base helper for a single news-site scraper.

    Bundles reading the cached page, menu extraction with BeautifulSoup,
    date filtering, and small text utilities.

    NOTE(review): ``self.menu`` (read by ``get_menu``) and ``self.result``
    (returned by ``get_posts``) are never assigned in this class; presumably
    subclasses provide them - confirm against the concrete scrapers.
    """

    # Emoji, pictograph, misc-symbol and dingbat ranges stripped by ``clear``.
    # Compiled once here instead of on every call.
    _EMOJI_RE = re.compile(
        u"[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]+",
        re.UNICODE)

    # Unit spellings understood by ``change_date`` (compared lower-cased).
    _HOUR_UNITS = ('hours', 'hour')
    _MINUTE_UNITS = ('minutes', 'minute', 'mins', 'min')

    def __init__(self, link, posts, handbook):
        """Store the site link, the known post titles and the handbook.

        ``handbook`` is used as a mapping whose values are dicts with the
        keys ``'title'`` and ``'check'`` (see ``check_handbook_post``).
        """
        self.file = File()
        self.log = Log('sites')
        self.link = link
        self.posts = posts
        self.handbook = handbook

    def read_file(self):
        """Return the full text of the stored ``page`` / ``html`` file."""
        with self.file.read('page', 'html') as input_file:
            return input_file.read()

    def get_menu(self, type, value, inner='', span=False):
        """Collect the menu links of the page.

        type  - type of tag (id or class)
        value - value of tag
        inner - optional wrapper tag name; when given, the ``ul`` is looked
                up inside the first ``inner`` tag matching ``{type: value}``
        span  - forwarded to ``clear_title``

        Returns a list of ``{'title': ..., 'url': ...}`` dicts, one per
        distinct title that is present in ``self.menu``.

        Raises RuntimeError when the expected menu markup is not found.
        """
        soup = self.soup()
        if inner != '':
            container = soup.find(inner, {type: value})
            # A missing wrapper is reported the same way as a missing list
            # instead of failing with AttributeError on None.
            links = container.find('ul') if container is not None else None
        else:
            links = soup.find('ul', {type: value})
        if not links:
            raise RuntimeError("structure of the site menu has changed")

        pages = []
        seen_titles = set()
        for item in links.find_all('a'):
            title = self.clear_title(item, span)
            if title in self.menu and title not in seen_titles:
                seen_titles.add(title)
                pages.append({
                    'title': title,
                    'url': self.check_url(item.get('href'))
                })
        return pages

    def set_file(self, url):
        """Point the file helper at ``url`` within the 'sites' group."""
        self.file.set_file(url, 'sites')

    def soup(self):
        """Parse the stored page with the ``html.parser`` backend."""
        return BeautifulSoup(self.read_file(), 'html.parser')

    def check_date(self, date, is_timestump=False, format=None):
        """Return ``date`` as a timestamp, or None if it is over a day old.

        date         - date string, or a timestamp when ``is_timestump``
        is_timestump - when true, ``date`` is already a timestamp and is
                       returned unchanged if recent enough
        format       - ``strptime`` format; when omitted the first 19
                       characters are parsed as ``%Y-%m-%dT%H:%M:%S``

        Raises RuntimeError when the value cannot be parsed.
        """
        if format is None:
            format = "%Y-%m-%dT%H:%M:%S"
            # Numeric timestamps cannot be sliced; only trim textual input.
            if not isinstance(date, (int, float)):
                date = date[:19]
        try:
            if not is_timestump:
                parsed = datetime.strptime(date, format)
                date = time.mktime(parsed.timetuple())
            day_ago = datetime.today() - timedelta(days=1)
            if float(date) < day_ago.timestamp():
                return None
        except ValueError as err:
            raise RuntimeError("structure date has changed") from err
        return date

    def get_posts(self):
        """Return ``self.result``."""
        return self.result

    def get_titles(self):
        """Return ``self.posts``."""
        return self.posts

    def clear(self, text):
        """Replace non-breaking spaces, drop emoji/symbols, strip the ends."""
        return self._EMOJI_RE.sub(r'', text.replace("\xa0", " ")).strip()

    def change_date(self, date, format="%b %d, %Y"):
        """Convert a relative ``"<number> <unit> ago"`` string to a date.

        The unit is matched case-insensitively and may be an hour or minute
        spelling (``hours``, ``hour``, ``minutes``, ``minute``, ``mins``,
        ``min``). The result is the current time minus that offset,
        formatted with ``format``.

        Raises ValueError when the string does not consist of exactly three
        space-separated parts or the number is not an integer, and
        RuntimeError for an unsupported unit.
        """
        number, unit, _ = date.split(' ')
        unit = unit.lower()
        if unit in self._HOUR_UNITS:
            offset = timedelta(hours=int(number))
        elif unit in self._MINUTE_UNITS:
            offset = timedelta(minutes=int(number))
        else:
            # Previously this fell through and crashed with AttributeError
            # on ``str.strftime``; fail the same way check_date does.
            raise RuntimeError("structure date has changed")
        return (datetime.now() - offset).strftime(format)

    def clear_title(self, point, clear):
        """Return the stripped text of ``point``.

        When ``clear`` is falsy, every nested ``span`` is removed from the
        tag first, so its text is not part of the result.
        """
        spans = point.find_all('span')
        if spans and not clear:
            for span in spans:
                span.extract()
        return point.text.strip()

    def check_url(self, url):
        """Return ``url`` as an address prefixed with ``self.link``.

        Any occurrence of ``self.link`` and leading slashes are removed from
        ``url`` before the prefix is added.
        """
        return self.link + url.replace(self.link, '').lstrip('/')

    def check_handbook_post(self, title, text):
        """Return the handbook keys whose title occurs in the post.

        A handbook title matches when it appears in ``title`` or ``text``
        delimited by non-word characters or the string boundaries. Entries
        with ``check == 0`` are matched case-insensitively, all others
        case-sensitively.

        NOTE(review): handbook titles are inserted into the pattern
        unescaped, so regex metacharacters in a title are interpreted as
        regex syntax - confirm whether that is intended.
        """
        check = []
        for key, entry in self.handbook.items():
            flags = re.IGNORECASE if entry['check'] == 0 else 0
            pattern = re.compile(
                r'(^|\W)' + entry['title'] + r'(\W|$)', flags)
            if pattern.search(title) or pattern.search(text):
                check.append(key)
        return check

    def multiple_replacer(self, *key_values):
        """Build a function that applies all ``(old, new)`` pairs in one pass.

        Returns a callable taking a string and returning it with every
        ``old`` replaced by its ``new``. With no pairs the callable returns
        its input unchanged.
        """
        if not key_values:
            # An empty alternation would match the empty string everywhere
            # and fail on the lookup of '' in the replacement table.
            return lambda string: string
        replace_dict = dict(key_values)
        pattern = re.compile(
            "|".join(re.escape(old) for old, _ in key_values), re.M)
        return lambda string: pattern.sub(
            lambda match: replace_dict[match.group(0)], string)

    def multiple_replace(self, string, *key_values):
        """Return ``string`` with every ``(old, new)`` pair applied."""
        return self.multiple_replacer(*key_values)(string)
class Scraping(Union):
    """Run every enabled site scraper and accumulate the collected news.

    NOTE(review): ``start_import``, ``finish_import``, ``day_ago``,
    ``get_day`` and ``merge`` are not defined in this class; presumably they
    are inherited from ``Union`` - confirm.

    Site rows are accessed by index: ``site[1]`` is passed to the scraper as
    its link and used in log messages, ``site[2]`` selects the scraper
    class, and ``site[3] > 0`` marks the site as enabled.
    """

    def __init__(self):
        self.type = 'sites'
        self.db = Db()
        self.file = File()
        self.log = Log(self.type)
        self.news = []

    def site_list(self):
        """Return the list of sites from the db."""
        return self.db.get_sites()

    def posts_list(self):
        """Return the first column of every 'post' row since ``day_ago()``."""
        result = self.db.get_posts('post', self.day_ago())
        return [post[0] for post in result]

    def handbook_list(self):
        """Return the handbook rows as ``{row[0]: {'title', 'check'}}``."""
        return {
            handbook[0]: {'title': handbook[1], 'check': handbook[2]}
            for handbook in self.db.get_handbook()
        }

    def start(self):
        """Scrape all enabled sites.

        Returns False when ``start_import()`` reports the import is not
        ready; otherwise returns None after ``finish_import()``.
        Sites are only processed when the handbook is non-empty.
        """
        # check import is ready
        if not self.start_import():
            return False
        self.posts = self.posts_list()
        self.handbook = self.handbook_list()
        if self.handbook:
            for site in self.site_list():
                if site[3] > 0:
                    self.scrap(site)
                else:
                    self.log.write("resourse {0} is desabled".format(site[1]))
        print(len(self.news))
        self.finish_import()

    def scrap(self, site):
        """Run the scraper for one site and merge its results.

        A site whose type has no scraper class is logged and skipped, so a
        single unrecognised row does not abort the remaining sites.
        """
        self.file.set_file(site[1])
        resourse = self.switch(site)
        if resourse is None:
            self.log.write(
                "no scraper for resource type {0} ({1})".format(
                    site[2], site[1]))
            return
        resourse.start()
        self.news = self.merge(self.news, resourse.get_posts())
        self.posts = resourse.get_titles()

    def _scraper_classes(self):
        """Return the mapping from site type (``site[2]``) to scraper class."""
        return {
            'thebitcoinnews': CThebitcoinnews,
            'coinjournal': CCoinjournal,
            'coindesk': CCoindesk,
            'bitcoin': CBitcoin,
            'cointelegraph': CCointelegraph,
            'bitcoinmagazine': CBitcoinmagazine,
            'newsbtc': CNewsbtc,
            'forklog': CForklog,
            'coinspeaker': CCoinspeaker,
            'bitcoinist': CBitcoinist,
            'bitcoinertoday': CBitcoinertoday,
            'coindoo': CCoindoo,
            'trustnodes': CTrustnodes,
            'btcmanager': CBtcmanager,
            'usethebitcoin': CUsethebitcoin,
            'investinblockchain': CInvestinblockchain,
            'ethereumworldnews': CEthereumworldnews,
            'coinstaker': CCoinstaker,
            'livebitcoinnews': CLivebitcoinnews,
            'coinsnewbium': CCoinsnewbium,
            'ccn': CCcn,
            'themerkle': CThemerkle,
            'ethnews': CEthnews,
            'zycrypto': CZycrypto,
            'profitconfidential': CProfitconfidential,
            'cryptoanswers': CCryptoanswers,
            'bloomberg': CBloomberg,
        }

    def switch(self, site):
        """Instantiate the scraper matching ``site[2]``.

        Writes a start line to the log, then returns a new scraper built
        with ``(site[1], self.posts, self.handbook)``, or None when the type
        is not recognised.
        """
        self.log.write("---\n{0} start at {1}".format(site[1], self.get_day()))
        scraper_class = self._scraper_classes().get(site[2])
        if scraper_class is None:
            return None
        return scraper_class(site[1], self.posts, self.handbook)