def get_plain_text(self, url):
     text = ''
     try:
         page_content = WebHelper.get_page_content_from_url(url)
         if page_content is None:
             print('[Error]', url)
             return ''
         page_content = page_content.decode('utf-8')
         soup = BeautifulSoup(page_content, 'lxml')
         # kill all script and style elements
         for script in soup(["script", "style"]):
             script.extract()
         text = soup.get_text()
         # break into lines and remove leading and trailing space on each
         lines = (' '.join(line.strip().split())
                  for line in text.splitlines())
         text = '\n'.join(lines)
         text = os.linesep.join([s for s in text.splitlines() if s])
         time.sleep(random.randint(1, 3))
     except KeyboardInterrupt:
         # re-raise so Ctrl-C can stop the crawl
         raise
     except Exception as e:
         print(e)
     return text
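A self-contained sketch of the same strip-scripts-and-collapse-whitespace pipeline used by get_plain_text, run on a static HTML string instead of a fetched page (the sample markup is invented; requires beautifulsoup4 and lxml):

# Standalone demo of the HTML-to-text cleanup above; the input HTML is made up.
from bs4 import BeautifulSoup

html = '<html><body><script>var x = 1;</script><p>  Hello   world </p><p></p></body></html>'
soup = BeautifulSoup(html, 'lxml')
for tag in soup(['script', 'style']):
    tag.extract()                      # drop non-visible elements
text = soup.get_text()
lines = (' '.join(line.split()) for line in text.splitlines())
print('\n'.join(line for line in lines if line))   # -> Hello world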
Example 2
 def get_search_page_by_name(cls, name):
     """
     get html content of the search page as a bing_result of the given name
     :param name: name to be searched on search engine
     :return: html content of search page
     """
     name = str(name).replace(' ', '+')
     search_url = cls.__SEARCH_ROOT_URL__ + name
     return WebHelper.get_page_content_from_url(search_url)
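get_search_page_by_name only replaces spaces with '+' and prepends the class's __SEARCH_ROOT_URL__ before delegating the fetch to WebHelper. A standalone illustration of that URL construction (the root URL is taken from the BingHelper definition later in this listing; build_search_url is a hypothetical helper):

SEARCH_ROOT_URL = 'https://cn.bing.com/search?q='   # from BingHelper below

def build_search_url(name):
    # mirrors get_search_page_by_name's URL construction, without the HTTP request
    return SEARCH_ROOT_URL + str(name).replace(' ', '+')

print(build_search_url('jie tang mail'))   # https://cn.bing.com/search?q=jie+tang+mail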
Example 4
 def get_next_level_url(self):
     # compute next level url, set prompt or next_level_url
     self.do_compute()
     # get next level url
     if self._next_level_url is not None:
         return self._next_level_url
     if self._prompt is not None:
         self._next_level_url = WebHelper.join_url(self.url, self._prompt)
         return self._next_level_url
     else:
         print "do_compute should set at least one value of [prompt, next_level_url]"
         return self.url
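The contract of get_next_level_url is that do_compute must set either _prompt (a relative path that gets joined onto the current URL via WebHelper.join_url) or _next_level_url (an absolute URL). A minimal hypothetical subclass honoring that contract; LevelBase stands in for the unnamed base class whose __init__ appears in the next example, and the URLs are invented:

# Illustration only: LevelBase mimics the relevant attributes of the base class
# from the next example, and the prompt/URL values are made up.
class LevelBase(object):
    def __init__(self, url):
        self.url = url
        self._prompt = None
        self._next_level_url = None

class ExampleLevel(LevelBase):
    def do_compute(self):
        # Either set a relative prompt, which get_next_level_url joins onto self.url ...
        self._prompt = 'next.html'
        # ... or set the absolute next-level URL directly:
        # self._next_level_url = 'http://www.pythonchallenge.com/pc/def/next.html'

level = ExampleLevel('http://www.pythonchallenge.com/pc/def/example.html')
level.do_compute()
print(level._prompt)   # next.html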
Example 5
    def __init__(self, url_or_result, need_authentication=False, user_name='huge', pass_word='file'):
        if type(url_or_result) is Result:
            self.url = url_or_result.url
            self.user = url_or_result.user
            self.password = url_or_result.password
            self.file_name = url_or_result.file
            if self.user is not None and self.password is not None:
                need_authentication = True
            self.result = url_or_result
        else:
            self.url = url_or_result
            self.user = user_name
            self.password = pass_word
            self.result = Result()

        if need_authentication:
            self.url, self.web_source = WebHelper.get_auth_url_content(self.url, self.user, self.password)
        else:
            self.url, self.web_source = WebHelper.get_final_url_content(self.url)
        self._prompt = None
        self._next_level_url = None
Example 6
    def meet_row_requirement(self, r):
        row = [self.table[i][r] for i in range(self.width)]
        row_filled = []
        total = 0
        for i in range(self.width):
            if row[i] == 1:
                total += 1
            elif total != 0:
                row_filled.append(total)
                total = 0
        if total != 0:
            row_filled.append(total)
        if row_filled == self.column_bar[r]:
            return True
        else:
            # print "Row %i from table: %s" % (r, str(row))
            # print "Required: %s, Get: %s" % (str(self.vertical[r]), row_filled)
            return False

    def print_result(self):
        for i in range(self.height):
            print(''.join(map(lambda x: self.filled if x == 1 else self.unfilled,
                              [self.table[j][i] for j in range(self.width)])))


if __name__ == '__main__':
    file_web_url = 'http://www.pythonchallenge.com/pc/rock/warmup.txt'
    content = WebHelper.get_auth_web_source(file_web_url, 'kohsamui', 'thailand')
    sketch = Sketch(content)
    sketch.play_game()
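meet_row_requirement above run-length encodes the 1s in a row and compares the resulting run lengths against the stored clue. A standalone illustration of that encoding on an invented row:

def runs_of_ones(cells):
    # same run-length encoding as meet_row_requirement, on a flat list of 0/1 cells
    runs, total = [], 0
    for cell in cells:
        if cell == 1:
            total += 1
        elif total:
            runs.append(total)
            total = 0
    if total:
        runs.append(total)
    return runs

print(runs_of_ones([1, 1, 0, 1, 0, 0, 1, 1, 1]))   # [2, 1, 3] -> matches a clue of [2, 1, 3]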
Example 7
class GoogleHelper(SearchHelper):
    __parser__ = GooglePageHTMLParser
    __RESULT_DIR_PATH__ = '../google_result/'
    __SEARCH_ROOT_URL__ = 'https://www.google.com/search?hl=en&safe=off&q='


class BingHelper(SearchHelper):
    __parser__ = BingPageHTMLParser
    __RESULT_DIR_PATH__ = '../bing_result/'
    __SEARCH_ROOT_URL__ = 'https://cn.bing.com/search?q='


if __name__ == '__main__':
    # bing_result = GoogleHelper.get_google_search_page_by_name('jie tang mail')
    # resultFile = open('bing_result.html', 'w')
    # resultFile.write(bing_result)
    #
    # title_url_dict = GoogleHelper.get_google_items_from_search_page(bing_result)
    # for url, title in title_url_dict:
    #     print url, title

    content = WebHelper.get_page_content_from_url(
        'http://www.google.com/search?q=jie+tang+tsinghua+email')
    # get_page_content_from_url returns raw bytes (see get_plain_text above),
    # so write in binary mode and let the context manager close the file
    with open('bing_result.html', 'wb') as result:
        result.write(content)

    # proxy = urllib2.ProxyHandler({'http': 'http://*****:*****@tel.lc.ignw.net:25'})
    # auth = urllib2.HTTPBasicAuthHandler()
    # opener = urllib2.build_opener(proxy, auth, urllib2.HTTPHandler)
    # urllib2.install_opener(opener)
    # print 'ready to open'
    # conn = urllib2.urlopen('http://www.google.com')
    # print conn.read()
Example 8
 def __init__(self):
     self.api = APIHelper()
     self.data = DataManager()
     self.web = WebHelper()
Example 9
class Fetcher(object):
    """API fetcher to complete tasks"""
    def __init__(self):
        self.api = APIHelper()
        self.data = DataManager()
        self.web = WebHelper()

    def fetch_single_topic(self, topic_id):
        topic_info = self.api.get_topic_info(topic_id)
        partial_member = self.data.member_of_topic(topic_info)
        stored_topic = self.data.find_topic(topic_id)
        new_topic = self.data.handle_topic(topic_info, topic_id)
        if stored_topic:
            new_topic['web_crawled'] = stored_topic['web_crawled']
        else:
            new_topic['web_crawled'] = datetime.fromtimestamp(0)
            new_topic['click'] = 0
            new_topic['favorite'] = 0
            new_topic['thank'] = 0

        self.data.update_topic_synced_state(new_topic['id'], False)
        self.data.upsert_topic(new_topic)
        self.data.upsert_member(partial_member)

    def fetch_new_topics(self):
        max_stored_topic_id = self.data.max_stored_topic_id
        topic_count = self.api.get_topic_count()
        if max_stored_topic_id >= topic_count:
            return
        for topic_id in range(max_stored_topic_id + 1, topic_count + 1):
            self.fetch_single_topic(topic_id)

    def fetch_replies_of_topic(self, topic_id):
        replies = self.api.get_replies(topic_id)
        if replies:
            self.data.update_topic_synced_state(topic_id, False)
            for reply in replies:
                partial_member = self.data.member_of_reply(reply)
                self.data.upsert_reply(self.data.handle_reply(reply, topic_id))
                self.data.upsert_member(partial_member)

    def fetch_new_replies(self):
        max_stored_topic_id = self.data.max_stored_topic_id
        max_stored_topic_id_of_reply = self.data.max_stored_topic_id_of_reply
        if max_stored_topic_id_of_reply == 0:
            need_refetch_max_topic = False
        else:
            topic = self.data.find_topic(max_stored_topic_id_of_reply)
            replies = self.data.find_all_replies(max_stored_topic_id_of_reply)
            need_refetch_max_topic = replies.count() != topic['replies']
        for topic_id in range(
                max_stored_topic_id_of_reply + 1 - int(need_refetch_max_topic),
                max_stored_topic_id + 1):
            if self.data.find_topic(topic_id=topic_id):
                self.fetch_replies_of_topic(topic_id)

    def fetch_single_topic_extras(self, topic_id):
        def upsert_counts(click, favorite, thank):
            self.data.upsert_topic_extras(topic_id, click, favorite, thank)
            self.data.update_topic_synced_state(topic_id, False)
            count = self.data.update_topic_web_crawled(topic_id,
                                                       datetime.utcnow())
            logging.info('Update topic {0} extras, count {1}'.format(
                topic_id, count))

        web_extra = self.web.get_topic_extras(topic_id)
        if not web_extra:
            count = self.data.update_topic_web_crawled(topic_id,
                                                       datetime.utcnow())
            topic = self.data.find_topic(topic_id)
            if (not topic) or ('click' not in topic):
                upsert_counts(0, 0, 0)
            logging.info('Topic {0} extras is None, count {1}'.format(
                topic_id, count))
            return
        for index, postscript in enumerate(web_extra.subtle_list):
            postscript = self.data.handle_postscript(postscript, topic_id,
                                                     index + 1)
            self.data.upsert_postscript(postscript)

        upsert_counts(web_extra.click, web_extra.favorite, web_extra.thank)

    def fetch_all_topic_extras(self):
        while True:
            min_topic_id_need_postscript = self.data.min_topic_id_need_extras
            if min_topic_id_need_postscript > 0:
                try:
                    self.fetch_single_topic_extras(
                        min_topic_id_need_postscript)
                except ElasticsearchException:
                    # let Elasticsearch failures propagate and stop the loop
                    raise
                except Exception as e:
                    logging.error('Fetch single topic extras error: ' + str(e))
            else:
                break

    def fetch_all_nodes(self):
        nodes = self.api.get_all_nodes()
        if nodes:
            for node in nodes:
                node['crawled'] = datetime.utcnow()
                self.data.upsert_node(node)

        logging.info('Fetched all nodes, count: ' + str(len(nodes or [])))

    def fetch_new_members(self):
        site_stats = self.api.get_site_stats()
        max_stored_member_id = self.data.max_stored_member_id
        if site_stats:
            member_max = site_stats['member_max']
            for member_id in range(max_stored_member_id + 1, member_max + 1):
                new_member = self.api.get_member_info(member_id=member_id)
                if not new_member:
                    continue
                new_member['crawled'] = datetime.utcnow()
                self.data.upsert_member(new_member)
                logging.info('Upsert member {0}, id {1}'.format(
                    new_member['username'], new_member['id']))
        else:
            new_member = self.api.get_member_info(
                member_id=self.data.max_stored_member_id + 1)
            while new_member:
                new_member['crawled'] = datetime.utcnow()
                self.data.upsert_member(new_member)
                logging.info('Upsert member {0}, id {1}'.format(
                    new_member['username'], new_member['id']))
                new_member = self.api.get_member_info(
                    member_id=self.data.max_stored_member_id + 1)

    def fetch_stale_topics(self):
        stale_topics = self.data.stale_topics()
        if not stale_topics:
            return
        for topic in stale_topics:
            topic_id = topic['id']
            self.fetch_single_topic(topic_id)
            if topic['web_crawled'] and topic[
                    'recrawl'] and topic['web_crawled'] < topic['recrawl']:
                self.fetch_single_topic_extras(topic_id)
            self.fetch_replies_of_topic(topic_id)

    def sync_topic_to_es(self):
        waiting_topics = self.data.not_synced_topics()
        if not waiting_topics:
            return
        for topic in waiting_topics:
            self.data.update_topic_synced_state(topic['id'], True)
            self.data.es_update_assembled_topic(topic)
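A sketch of how these Fetcher methods might be driven in sequence; the import path and the call ordering are assumptions, not taken from the original project:

# Hypothetical driver for the Fetcher above.
from fetcher import Fetcher   # import path is an assumption

def run_once():
    fetcher = Fetcher()
    fetcher.fetch_all_nodes()         # node metadata
    fetcher.fetch_new_topics()        # topics newer than the stored maximum
    fetcher.fetch_new_replies()       # replies for topics missing them
    fetcher.fetch_all_topic_extras()  # click/favorite/thank counts and postscripts
    fetcher.fetch_stale_topics()      # re-crawl topics flagged for recrawl
    fetcher.sync_topic_to_es()        # push not-yet-synced topics to Elasticsearch

if __name__ == '__main__':
    run_once()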
Example 10
 def create_image_from_web(img_url, user=None, password=None):
     if user is None or password is None:
         img_data = WebHelper.get_web_source(img_url)
     else:
         img_data = WebHelper.get_auth_web_source(img_url, user, password)
     return Image.open(StringIO(img_data))
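create_image_from_web is Python 2 style (StringIO over byte data). A hedged Python 3 equivalent would use io.BytesIO; WebHelper is assumed to return raw bytes as in the original:

# Python 3 sketch of the same helper; the WebHelper calls and their bytes
# return type are assumed to behave as in the original project.
from io import BytesIO
from PIL import Image

def create_image_from_web_py3(img_url, user=None, password=None):
    if user is None or password is None:
        img_data = WebHelper.get_web_source(img_url)
    else:
        img_data = WebHelper.get_auth_web_source(img_url, user, password)
    return Image.open(BytesIO(img_data))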
Example 11
 def show_image_from_web(img_url, user=None, password=None):
     if user is None or password is None:
         img_data = WebHelper.get_web_source(img_url)  # no credentials: fetch without authentication
     else:
         img_data = WebHelper.get_auth_web_source(img_url, user, password)
     ImageHelper.show_image_from_data(img_data)