Exemple #1
0
class Main:
    """Shared helpers for the per-site scrapers.

    The class reads a previously saved HTML page through ``self.file``,
    extracts menu links from it and offers utilities for cleaning text,
    normalising dates and URLs, and matching posts against the handbook.

    ``self.menu`` (read by ``get_menu``) and ``self.result`` (returned by
    ``get_posts``) are not assigned here; presumably the concrete scraper
    subclasses provide them -- confirm against those subclasses.
    """

    def __init__(self, link, posts, handbook):
        """Store the scraper configuration.

        Args:
            link: Base URL of the site; ``check_url`` prefixes it to every
                URL it returns.
            posts: Collection handed back unchanged by ``get_titles``.
            handbook: Mapping of handbook key to a dict with the entries
                ``'title'`` (term to look for) and ``'check'`` (``0`` means
                case-insensitive matching), as used by
                ``check_handbook_post``.
        """
        self.file = File()
        self.log = Log('sites')

        self.link = link
        self.posts = posts
        self.handbook = handbook

    def read_file(self):
        """Return the full text of the saved ``page`` / ``html`` file."""
        with self.file.read('page', 'html') as input_file:
            text = input_file.read()

        return text

    def get_menu(self, type, value, inner='', span=False):
        """Collect the tracked menu entries of the saved page.

        Args:
            type: Attribute used to locate the menu (``'id'`` or
                ``'class'``).
            value: Value of that attribute.
            inner: Optional tag name of a wrapper element. When given, the
                wrapper is located by the attribute and its first ``<ul>``
                is used; otherwise the ``<ul>`` itself must carry the
                attribute.
            span: Forwarded to ``clear_title``; when false, ``<span>``
                children are removed from a link before its text is read.

        Returns:
            A list of ``{'title': ..., 'url': ...}`` dicts in document
            order, one per distinct title that is present in ``self.menu``.

        Raises:
            RuntimeError: If the menu element cannot be found.
        """
        soup = self.soup()

        if inner != '':
            wrapper = soup.find(inner, {type: value})
            # A missing wrapper is the same failure as a missing list, so it
            # must end in the RuntimeError below rather than AttributeError.
            links = wrapper.find('ul') if wrapper is not None else None
        else:
            links = soup.find('ul', {type: value})

        if not links:
            raise RuntimeError("structure of the site menu has changed")

        pages = []
        seen = set()

        for item in links.find_all('a'):
            title = self.clear_title(item, span)

            if title not in self.menu or title in seen:
                continue

            href = item.get('href')
            if href is None:
                # An anchor without a target has no URL to crawl; skip it
                # without marking the title as seen so that a later anchor
                # with the same title can still be picked up.
                continue

            seen.add(title)
            pages.append({
                'title': title,
                'url': self.check_url(href)
            })

        return pages

    def set_file(self, url):
        """Point ``self.file`` at the storage location for ``url``."""
        self.file.set_file(url, 'sites')

    def soup(self):
        """Parse the saved page with BeautifulSoup's ``html.parser``."""
        text = self.read_file()

        return BeautifulSoup(text, 'html.parser')

    def check_date(self, date, is_timestump=False, format=None):
        """Return ``date`` as a timestamp if it is less than a day old.

        Args:
            date: Date string, or a timestamp when ``is_timestump`` is true.
            is_timestump: When true, ``date`` is already a timestamp and is
                not parsed.
            format: ``strptime`` format of ``date``. When omitted, the first
                19 characters are parsed as ``%Y-%m-%dT%H:%M:%S``.

        Returns:
            The timestamp (the parsed value, or ``date`` itself when it was
            passed as a timestamp), or ``None`` when it is more than one day
            in the past.

        Raises:
            RuntimeError: If the value cannot be parsed.
        """
        if format is None:
            format = "%Y-%m-%dT%H:%M:%S"
            if not is_timestump:
                # Drop fractional seconds / zone suffix of ISO strings.
                # Timestamps are left alone: they may be numbers, which
                # cannot be sliced.
                date = date[:19]

        try:
            if not is_timestump:
                date = time.mktime(datetime.strptime(date, format).timetuple())

            day_ago = datetime.today() - timedelta(days=1)

            if float(date) < day_ago.timestamp():
                return None
        except ValueError as err:
            raise RuntimeError("structure date has changed") from err

        return date

    def get_posts(self):
        """Return ``self.result``."""
        return self.result

    def get_titles(self):
        """Return the ``posts`` collection passed to the constructor."""
        return self.posts

    def clear(self, text):
        """Strip emoji/pictographs and surrounding whitespace from ``text``.

        Non-breaking spaces are converted to regular spaces first.
        """
        # Python 3 strings always cover the full code point range, so the
        # surrogate-pair fallback needed on narrow builds is not required.
        emoji = re.compile(
            u"[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]+",
            re.UNICODE)

        return emoji.sub(r'', text.replace("\xa0", " ")).strip()

    def change_date(self, date, format="%b %d, %Y"):
        """Convert a relative date such as ``"3 hours ago"`` to a string.

        Args:
            date: Three whitespace-separated tokens: amount, unit and a
                trailing word (e.g. ``ago``). Units are matched
                case-insensitively; hours, minutes and days are supported.
            format: ``strftime`` format of the result.

        Returns:
            The current local time minus the given amount, formatted with
            ``format``.

        Raises:
            ValueError: If ``date`` does not consist of three tokens or the
                amount is not an integer.
            RuntimeError: If the unit is not recognised.
        """
        number, unit, _suffix = date.split()
        amount = int(number)
        unit = unit.lower()

        if unit in ('hour', 'hours'):
            delta = timedelta(hours=amount)
        elif unit in ('minute', 'minutes', 'min', 'mins'):
            delta = timedelta(minutes=amount)
        elif unit in ('day', 'days'):
            delta = timedelta(days=amount)
        else:
            # Previously an unknown unit fell through and failed with an
            # unrelated AttributeError on the input string.
            raise RuntimeError("structure date has changed")

        return (datetime.now() - delta).strftime(format)

    def clear_title(self, point, clear):
        """Return the stripped text of ``point``.

        Unless ``clear`` is true, every ``<span>`` inside ``point`` is
        removed from the tree first, so its text is not part of the result.
        Note that this modifies ``point`` in place.
        """
        spans = point.find_all('span')

        if spans and not clear:
            for span in spans:
                span.extract()

        return point.text.strip()

    def check_url(self, url):
        """Return ``url`` rebuilt on top of ``self.link``.

        Occurrences of ``self.link`` and any leading slashes are removed
        from ``url`` before ``self.link`` is prefixed, so both absolute and
        site-relative URLs yield the same result.
        """
        return self.link + url.replace(self.link, '').lstrip('/')

    def check_handbook_post(self, title, text):
        """Return the handbook keys whose term occurs in ``title`` or ``text``.

        A term matches only as a whole word, i.e. delimited by non-word
        characters or the string boundaries. Entries whose ``'check'`` value
        is ``0`` are matched case-insensitively, all others exactly.
        """
        matched = []

        for key, entry in self.handbook.items():
            flags = re.IGNORECASE if entry['check'] == 0 else 0
            # Escape the term so titles containing regex metacharacters
            # (e.g. "C++") are matched literally instead of breaking the
            # pattern.
            pattern = re.compile(
                r'(^|\W)' + re.escape(entry['title']) + r'(\W|$)', flags)

            if pattern.search(title) or pattern.search(text):
                matched.append(key)

        return matched

    def multiple_replacer(self, *key_values):
        """Build a function that applies all replacements in a single pass.

        Args:
            *key_values: ``(search, replacement)`` pairs.

        Returns:
            A callable taking a string and returning it with every
            ``search`` replaced by its ``replacement``.
        """
        if not key_values:
            # An empty alternation would match the empty string everywhere
            # and fail on lookup; with nothing to replace, return the input.
            return lambda string: string

        replace_dict = dict(key_values)
        pattern = re.compile("|".join(re.escape(k) for k in replace_dict),
                             re.M)

        def replacement_function(match):
            return replace_dict[match.group(0)]

        return lambda string: pattern.sub(replacement_function, string)

    def multiple_replace(self, string, *key_values):
        """Apply the ``(search, replacement)`` pairs to ``string``."""
        return self.multiple_replacer(*key_values)(string)
Exemple #2
0
class Scraping(Union):
    """Run one import pass over every enabled site stored in the database.

    Each site row is read positionally: index 1 is passed to the scraper as
    its link and used in log messages, index 2 is the key that selects the
    scraper class, and index 3 enables the site when greater than zero.
    """

    def __init__(self):
        self.type = 'sites'

        self.db = Db()
        self.file = File()
        self.log = Log(self.type)

        # Posts gathered from all scraped sites during this run.
        self.news = []

    def site_list(self):
        """Return the list of sites from the database."""
        return self.db.get_sites()

    def posts_list(self):
        """Return the first column of every ``'post'`` row since ``day_ago()``."""
        return [post[0] for post in self.db.get_posts('post', self.day_ago())]

    def handbook_list(self):
        """Return the handbook as ``{key: {'title': ..., 'check': ...}}``.

        Built from the database rows, where column 0 is the key, column 1
        the title and column 2 the check flag.
        """
        return {
            handbook[0]: {
                'title': handbook[1],
                'check': handbook[2]
            }
            for handbook in self.db.get_handbook()
        }

    def start(self):
        """Scrape all enabled sites.

        Returns:
            ``False`` when ``start_import()`` reports that the import may
            not start; ``None`` otherwise.
        """
        # check import is ready
        if not self.start_import():
            return False

        self.posts = self.posts_list()
        self.handbook = self.handbook_list()

        # Without handbook terms there is nothing to match posts against.
        if self.handbook:
            for site in self.site_list():
                if site[3] > 0:
                    self.scrap(site)
                else:
                    self.log.write("resourse {0} is desabled".format(site[1]))

            # save news to db
            print(len(self.news))

        # NOTE(review): finish_import() is skipped when a scraper raises;
        # confirm whether it should run in a ``finally`` block.
        self.finish_import()

    def scrap(self, site):
        """Run the scraper for ``site`` and merge its posts into ``self.news``.

        Sites whose key has no scraper class are logged and skipped, so a
        single unknown row does not abort the whole run.
        """
        self.file.set_file(site[1])

        resourse = self.switch(site)
        if resourse is None:
            self.log.write("no scraper for resource type {0} ({1})".format(
                site[2], site[1]))
            return

        resourse.start()

        self.news = self.merge(self.news, resourse.get_posts())
        # Carry the scraper's titles over to the next site.
        self.posts = resourse.get_titles()

    def switch(self, site):
        """Instantiate the scraper class registered for ``site[2]``.

        Returns:
            The scraper instance, or ``None`` when the key is unknown.
        """
        self.log.write("---\n{0} start at {1}".format(site[1], self.get_day()))

        scrapers = {
            'thebitcoinnews': CThebitcoinnews,
            'coinjournal': CCoinjournal,
            'coindesk': CCoindesk,
            'bitcoin': CBitcoin,
            'cointelegraph': CCointelegraph,
            'bitcoinmagazine': CBitcoinmagazine,
            'newsbtc': CNewsbtc,
            'forklog': CForklog,
            'coinspeaker': CCoinspeaker,
            'bitcoinist': CBitcoinist,
            'bitcoinertoday': CBitcoinertoday,
            'coindoo': CCoindoo,
            'trustnodes': CTrustnodes,
            'btcmanager': CBtcmanager,
            'usethebitcoin': CUsethebitcoin,
            'investinblockchain': CInvestinblockchain,
            'ethereumworldnews': CEthereumworldnews,
            'coinstaker': CCoinstaker,
            'livebitcoinnews': CLivebitcoinnews,
            'coinsnewbium': CCoinsnewbium,
            'ccn': CCcn,
            'themerkle': CThemerkle,
            'ethnews': CEthnews,
            'zycrypto': CZycrypto,
            'profitconfidential': CProfitconfidential,
            'cryptoanswers': CCryptoanswers,
            'bloomberg': CBloomberg,
        }

        scraper_class = scrapers.get(site[2])
        if scraper_class is None:
            return None

        return scraper_class(site[1], self.posts, self.handbook)