Example #1
def get_service_search_results(service_file):
    articles = []
    with open(service_file, 'r') as services:
        for line in services.readlines():
            line = line.strip()
            # Skip comments and blank lines
            if line.startswith('#') or len(line) == 0:
                continue
            query = line.split()

            # Retry once if the crawl fails; give up after two failures
            if len(query) == 2:
                try:
                    articles.extend(get_service_search_page(
                        query[0], query[1]))
                except Exception as e:
                    print('Exception occurred, retrying with %s' % query[1])
                    try:
                        articles.extend(
                            get_service_search_page(query[0], query[1]))
                    except Exception as e:
                        print('%s Exception occurred again!' % query[1])
                        continue

            elif len(query) == 3:
                try:
                    articles.extend(
                        get_service_search_page(query[0], query[1],
                                                int(query[2])))
                except Exception as e:
                    print('Exception occurred, retrying with %s' % query[1])
                    try:
                        articles.extend(
                            get_service_search_page(query[0], query[1],
                                                    int(query[2])))
                    except Exception as e:
                        print('%s Exception occurred again!' % query[1])
                        continue

            else:
                print(line + u': invalid input format!')
                return
            '''
            for index, article in enumerate(articles):
                try:
                    html_info = article_extractor(article['article_url'])
                    articles[index].update(html_info)
                except Exception as e:
                    print(e)
                    continue
            '''

    articles = remove_dup(articles)
    # query MongoDB and remove documents already stored
    # articles = article_to_mongodb('weixin', 'service', articles)
    article_to_file('service', articles)
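
# The service file parsed above is expected to contain one query per line:
# either two whitespace-separated fields, or three with the last being an
# integer page count; '#' lines and blank lines are skipped. The sketch below
# is a hypothetical illustration of such a file and how the function might be
# driven; the meaning of the first two fields (an account id and a display
# name) is an assumption, not confirmed by the source.
#
# sample_services = """\
# # lines starting with '#' and blank lines are skipped
# wx_account_1 NewsService
# wx_account_2 TechService 3
# """
#
# if __name__ == '__main__':
#     with open('services.txt', 'w') as f:
#         f.write(sample_services)
#     # Crawl every service listed in the file and write the results to disk.
#     get_service_search_results('services.txt')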
Example #2
def get_keyword_search_results(keywords_file):
    articles = []
    with open(keywords_file, 'r') as keywords:
        for line in keywords.readlines():
            line = line.strip()
            # Skip comments and blank lines
            if line.startswith('#') or len(line) == 0:
                continue
            query = line.split()

            # Retry once if the crawl fails; give up after two failures
            if len(query) == 1:
                try:
                    articles.extend(get_keyword_search_page(query[0]))
                except Exception as e:
                    print(str(e))
                    print('Exception occurred, retrying with %s' % query[0])
                    try:
                        articles.extend(get_keyword_search_page(query[0]))
                    except Exception as e:
                        print('%s Exception occurred again!' % query[0])
                        continue

            elif len(query) == 2:
                try:
                    articles.extend(
                        get_keyword_search_page(query[0], int(query[1])))
                except Exception as e:
                    print(str(e))
                    print('Exception occurred, retrying with %s' % query[0])
                    try:
                        articles.extend(
                            get_keyword_search_page(query[0], int(query[1])))
                    except Exception as e:
                        print('%s Exception occurred again!' % query[0])
                        continue

            else:
                print(line + u': invalid input format!')
                return
            '''
            for index, article in enumerate(articles):
                print('get html_info begin')
                try:
                    html_info = article_extractor(article['article_url'])
                    articles[index].update(html_info)
                except Exception as e:
                    print(e)
                    continue
                print('get html_info done')
            '''
    articles = remove_dup(articles)
    # query MongoDB and remove documents already stored
    # articles = article_to_mongodb('weixin', 'sougou', articles)
    article_to_file('keywords', articles)
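
# Both functions above repeat the same retry-once pattern (try, report, try
# again, give up), which doubles every call site. A small helper like the
# hedged sketch below could collapse that duplication; retry_once is a
# hypothetical name and not part of the original code.

def retry_once(func, *args):
    """Call func(*args); on failure, report and retry once, then give up.

    Returns the result list on success, or an empty list after two failures,
    so callers can always pass the return value to extend().
    """
    for attempt in (1, 2):
        try:
            return func(*args)
        except Exception as e:
            print('Attempt %d failed for %s: %s' % (attempt, args, e))
    return []

# Usage sketch: the len(query) branches could then shrink to
#     articles.extend(retry_once(get_keyword_search_page, *query_args))
# where query_args is the parsed (and type-converted) tuple for the line.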
Example #3
def get_keyword_search_results(keywords_file):
    articles = []
    with open(keywords_file, 'r') as keywords:
        for line in keywords.readlines():
            line = line.strip()
            # Skip comments and blank lines
            if line.startswith('#') or len(line) == 0:
                continue
            query = line.split()

            # Retry once if the crawl fails; give up after two failures
            if len(query) == 1:
                try:
                    articles.extend(get_keyword_search_page(query[0]))
                except Exception as e:
                    print(str(e))
                    print('Exception occurred, retrying with %s' % query[0])
                    try:
                        articles.extend(get_keyword_search_page(query[0]))
                    except Exception as e:
                        print('%s Exception occurred again!' % query[0])
                        continue

            elif len(query) == 2:
                try:
                    articles.extend(get_keyword_search_page(query[0], int(query[1])))
                except Exception as e:
                    print(str(e))
                    print('Exception occurred, retrying with %s' % query[0])
                    try:
                        articles.extend(get_keyword_search_page(query[0], int(query[1])))
                    except Exception as e:
                        print('%s Exception occurred again!' % query[0])
                        continue

            else:
                print(line + u': invalid input format!')
                return

            '''
            for index, article in enumerate(articles):
                print('get html_info begin')
                try:
                    html_info = article_extractor(article['article_url'])
                    articles[index].update(html_info)
                except Exception as e:
                    print(e)
                    continue
                print('get html_info done')
            '''
    articles = remove_dup(articles)
    # query MongoDB and remove documents already stored
    # articles = article_to_mongodb('weixin', 'sougou', articles)
    article_to_file('keywords', articles)
Example #4
def get_service_search_results(service_file):
    articles = []
    with open(service_file, 'r') as services:
        for line in services.readlines():
            line = line.strip()
            # Skip comments and blank lines
            if line.startswith('#') or len(line) == 0:
                continue
            query = line.split()

            # Retry once if the crawl fails; give up after two failures
            if len(query) == 2:
                try:
                    articles.extend(get_service_search_page(query[0], query[1]))
                except Exception as e:
                    print('Exception occurred, retrying with %s' % query[1])
                    try:
                        articles.extend(get_service_search_page(query[0], query[1]))
                    except Exception as e:
                        print('%s Exception occurred again!' % query[1])
                        continue

            elif len(query) == 3:
                try:
                    articles.extend(get_service_search_page(query[0], query[1], int(query[2])))
                except Exception as e:
                    print('Exception occurred, retrying with %s' % query[1])
                    try:
                        articles.extend(get_service_search_page(query[0], query[1], int(query[2])))
                    except Exception as e:
                        print('%s Exception occurred again!' % query[1])
                        continue

            else:
                print(line + u': invalid input format!')
                return

            '''
            for index, article in enumerate(articles):
                try:
                    html_info = article_extractor(article['article_url'])
                    articles[index].update(html_info)
                except Exception as e:
                    print(e)
                    continue
            '''

    articles = remove_dup(articles)
    # query MongoDB and remove documents already stored
    # articles = article_to_mongodb('weixin', 'service', articles)
    article_to_file('service', articles)