Example #1
    def post(self):
        """Post urls for extracting content (note: do not save the result)"""
        result = {'error': False, 'message': ''}
        urls = request.values.get('urls', '')

        urls = [u.strip().lower() for u in urls.split(',') if u.strip()]
        if not urls:
            result['error'] = True
            result['message'] = "The 'urls' parameter must not be empty"
            return result

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = True
            result['message'] = "The extractor name '%s' is not supported yet" % extractor_name
            return result

        # prepend a scheme to URLs that are missing one
        for idx, url in enumerate(urls):
            if not url.startswith('http'):
                urls[idx] = 'http://' + url

        s_crawler = PageCrawler()
        s_content_getter = ContentGetter(crawler=s_crawler,
                                         extractor=s_extractor)
        result['pages'] = s_content_getter.process(urls)
        return result
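
A minimal client-side sketch of how this endpoint might be called. The host, port, and route are assumptions (only the handler body is shown above), and 'dragnet' stands in for whichever names get_extractor() actually accepts.

import requests

# Hypothetical host/route; the snippet does not show how the resource is registered.
EXTRACT_URL = 'http://localhost:8888/extract'

payload = {
    'urls': 'example.com, https://www.python.org',  # comma-separated, scheme optional
    'extractor': 'dragnet',                          # assumed extractor name
}
data = requests.post(EXTRACT_URL, data=payload).json()
if data['error']:
    print(data['message'])
else:
    print(data['pages'])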
Example #2
import json
import random

# PageCrawler, DragnetPageExtractor, ContentGetter and logger are assumed to be
# importable from the project's own modules (the original imports are not shown).


def crawl_pages(input_file, output_file):
    logger.info('Start processing input %s...' % input_file)
    with open(input_file, 'r') as f:
        lines = f.readlines()
    # sample up to 1000 URLs and strip surrounding whitespace/newlines
    list_url = [u.strip() for u in random.sample(lines, min(1000, len(lines)))]

    page_crawler = PageCrawler()
    page_extractor = DragnetPageExtractor()
    content_getter = ContentGetter(page_crawler, page_extractor)
    result = content_getter.process(list_url)
    with open(output_file, 'w') as f:
        json.dump(result, f)

    logger.info('End processing input %s...' % input_file)
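
A hedged usage sketch for crawl_pages, assuming the input file simply lists one URL per line (which is what the readlines()/strip() handling above expects); the file names are placeholders.

# urls.txt is assumed to contain one URL per line, e.g.
#   https://example.com/page-1
#   https://example.com/page-2
crawl_pages('urls.txt', 'crawled_pages.json')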
Example #3
import pandas as pd

# FIELD_URL, FIELD_URL_PAGE_CONTENT and FIELD_URL_CRAWL_STATUS, together with the
# crawler/extractor classes below, are assumed to come from the project's own modules.

URL_TYPE_WEB = 'Web'
URL_TYPE_NEWS = 'News'

crawler = PageCrawler()
extractor = GooseDragnetPageExtractor()
content_getter = ContentGetter(crawler=crawler, extractor=extractor)

if __name__ == '__main__':
    url_file = 'data/top_10_ranking_keywords.xlsx'
    df = pd.read_excel(url_file)
    urls = set()
    for idx, row in df.iterrows():
        url = row[FIELD_URL]
        urls.add(url)
        # if idx == 5:
        #     break

    url_page_contents = content_getter.process(urls)
    for idx, row in df.iterrows():
        url = row[FIELD_URL]
        crawled_page = url_page_contents.get(url)
        # if not crawled_page:
        #     continue
        df.loc[idx, FIELD_URL_PAGE_CONTENT] = crawled_page['content']
        if crawled_page['error']:
            df.loc[idx, FIELD_URL_CRAWL_STATUS] = crawled_page['error']
        else:
            df.loc[idx, FIELD_URL_CRAWL_STATUS] = 'Crawl successfully'

    df.to_excel(url_file[:-5] + '.out.xlsx', index=False, encoding='utf-8')
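
For reference, the loop above only relies on content_getter.process() returning a dict keyed by URL whose values expose 'content' and 'error' fields; a sketch of that assumed shape:

# Assumed result shape, inferred from the column assignments above.
example_pages = {
    'https://example.com/page-1': {
        'content': 'Extracted main text of the page...',
        'error': '',           # falsy when the crawl succeeded
    },
    'https://example.com/page-2': {
        'content': '',
        'error': 'timeout',    # truthy message ends up in the crawl status column
    },
}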
Example #4
import unittest
from multiprocessing.dummy import Pool  # thread pool; the original import is not shown
from pprint import pprint

import requests
from elasticsearch import Elasticsearch

# PageCrawler, DragnetPageExtractor, AllTextPageExtractor, ContentGetter,
# CosineSimilarity and get_text_from_url are assumed to be importable from the
# project's own modules (the original imports are not shown).


class MyTestCase(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(MyTestCase, self).__init__(*args, **kwargs)
        self.main_url = "http://flask.pocoo.org/docs/0.10/deploying/wsgi-standalone/"
        self.sub_urls = [
            "http://flask.pocoo.org/docs/0.10/deploying/wsgi-standalone/"
        ]

        self.urls = self.sub_urls + [self.main_url]
        self.crawler = PageCrawler()
        self.extractor = DragnetPageExtractor()
        self.content_getter = ContentGetter(self.crawler, self.extractor)
        self.es_client = Elasticsearch()

    def test_crawler(self):
        result = self.crawler.process(self.urls)
        pprint(result)

    def test_extractor(self):
        pprint(self.extractor.process(self.crawler.process(self.urls)))

    def test_all_text_extractor(self):
        self.extractor = AllTextPageExtractor()
        pprint(
            self.extractor.process(
                self.crawler.process([
                    'https://www.uncommongoods.com/gifts/personalized/personalized-gifts'
                ])))

    def test_all_text_extractor2(self):
        self.extractor = AllTextPageExtractor()
        pprint(
            self.extractor.process(
                self.crawler.process([
                    'https://vnexpress.net/tin-tuc/thoi-su/8-nguoi-chet-hon-tram-nghin-ngoi-nha-bi-toc-mai-do-bao-doksuri-3642317.html'
                ])))

    def test_get_text_from_url(self):
        urls = [
            'https://www.uncommongoods.com/gifts/personalized/personalized-gifts',
            'https://stackoverflow.com/questions/1521592/get-root-domain-of-link',
            'https://docs.python.org/2/library/urlparse.html'
        ]

        for url in urls:
            print(get_text_from_url(url))

    def test_content_getter(self):
        result = self.content_getter.process(self.urls)
        pprint(result)

    def test_cosine_similarity(self):
        similarity = CosineSimilarity(self.content_getter, self.es_client)
        result = similarity.process(self.main_url, self.sub_urls)
        pprint(result)

    def _call_api(self, i):
        params = {
            'distance_metric': 'cosine',
            'main_url': self.main_url,
            'sub_urls': ', '.join(self.sub_urls)
        }
        response = requests.post(
            'http://107.170.109.238:8888/similarity/check', data=params)
        print(i, response.status_code)

    def test_api(self):
        params = {
            'distance_metric': 'cosine',
            'main_url': self.main_url,
            'sub_urls': ', '.join(self.sub_urls)
        }
        pool = Pool(4)
        pool.map(self._call_api, range(2000))

    def test_similarity_function(self):
        from similarity_checker import cosine_similarity, jaccard_similarity, fuzzy_similarity, simhash_similarity
        tokens_1 = 'This is a foo ba'.split()
        tokens_2 = 'This sentence is similar to a foo bar sentence'.split()
        pprint('jaccard: %s' % jaccard_similarity(tokens_1, tokens_2))
        pprint('cosine: %s' % cosine_similarity(tokens_1, tokens_2))
        pprint('fuzzy: %s' % fuzzy_similarity(tokens_1, tokens_2))
        pprint('simhash: %s' % simhash_similarity(tokens_1, tokens_2))

    def test_tokenizer(self):
        from similarity_checker import tokenize_and_normalize_content
        url = 'https://www.travelocity.com/Las-Vegas-Hotels-MGM-Grand-Hotel-Casino.h12628.Hotel-Information'
        page = self.content_getter.process([url])
        pprint(tokenize_and_normalize_content(page[url]['content']))

    def test_tokenize_and_normalize(self):
        from similarity_checker import tokenize_and_normalize_content
        text = 'what are you doing'
        pprint(
            tokenize_and_normalize_content(text,
                                           unit='character',
                                           min_ngram=1,
                                           max_ngram=3))
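
The class has no runner attached in the snippet; the usual unittest entry point can be appended, and a single test can be selected on the command line (the module name below is a placeholder).

if __name__ == '__main__':
    unittest.main()

# Run one test only, assuming the file is saved as test_content.py:
#   python -m unittest test_content.MyTestCase.test_crawler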
Example #5
    def post(self):
        """Post web pages to extract content"""
        result = {'error': False, 'pages': []}
        unit = request.values.get('unit', 'word')
        min_ngram = int(request.values.get('min_ngram', 1))
        max_ngram = int(request.values.get('max_ngram', 1))
        urls = request.values.get('urls', '')
        strip_chars = ' "\''
        urls = [
            u.strip(strip_chars) for u in urls.split(',')
            if u.strip(strip_chars)
        ]
        if not urls:
            result['error'] = 'urls must not be empty'

        extractor_name = request.values.get('extractor', list_extractor[0])
        s_extractor = get_extractor(extractor_name)
        if not s_extractor:
            result['error'] = "The extractor name '%s' is not supported yet" % extractor_name
            return jsonify(result)
        if extractor_name == 'selective':
            s_extractor.selector_type = request.values.get(
                'selector_type', list_extractor[0])
            selector = request.values.get('selector')
            if not selector or not selector.strip():
                result['error'] = ("You must specify the 'selector' parameter "
                                   "when the 'extractor' is 'selective'")
                return jsonify(result)
            s_extractor.selector = selector.strip()

        user_agent = request.values.get('user_agent', user_agents[0])
        page_load_timeout = request.values.get('page_load_timeout',
                                               page_load_timeout_default)
        wait_after_last_request = request.values.get(
            'wait_after_last_request', wait_after_last_request_default)
        s_crawler = PageCrawler(
            user_agent=user_agent.strip(),
            page_load_timeout=page_load_timeout,
            wait_after_last_request=wait_after_last_request)

        cache = int(request.values.get('cache', 0))
        if cache != 0:
            expire_time = int(request.values.get('expire_time',
                                                 604800))  # Seconds = 7 days
            s_crawler.active_redis_cache(expire_time)

        s_content_getter = ContentGetter(crawler=s_crawler,
                                         extractor=s_extractor)

        if not result['error']:
            pages = result['pages']
            for url, page in s_content_getter.process(urls).items():
                page['tokens'] = tokenize_and_normalize_content(
                    page['content'],
                    unit=unit,
                    min_ngram=min_ngram,
                    max_ngram=max_ngram)
                pages.append((url, page))

        return jsonify(result)
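
As with the first example, a client sketch under the assumption that this handler is mounted at an '/extract/tokens'-style route (the actual route is not shown). The form fields mirror the parameters read above; the 'selective' extractor name appears in the handler, but the valid selector_type values are an assumption.

import requests

# Hypothetical host/route; only the handler body is shown in the snippet.
TOKENIZE_URL = 'http://localhost:8888/extract/tokens'

payload = {
    'urls': 'https://example.com, https://www.python.org',
    'extractor': 'selective',       # 'selective' additionally requires a selector
    'selector_type': 'css',         # assumption: valid values are not shown above
    'selector': 'div.main-content',
    'unit': 'word',
    'min_ngram': 1,
    'max_ngram': 2,
    'cache': 1,                     # enable the Redis cache with the default 7-day expiry
}
print(requests.post(TOKENIZE_URL, data=payload).json())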