Code example #1
File: scraper.py Project: johnb30/scraper
def parse_results(rss_results, website, lang, db_collection):
    """
    Function to parse the links drawn from an RSS feed.

    Parameters
    ----------

    rss_results: pattern.web.Results.
                    Object containing data on the parsed RSS feed. Each item
                    represents a unique entry in the RSS feed and contains
                    relevant information such as the URL and title of the
                    story.

    website: String.
                Nickname for the RSS feed being scraped.

    lang: String.
                Language of the feed, e.g. 'english' or 'arabic'; selects the
                Goose extractor configuration.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    if lang == 'english':
        goose_extractor = Goose({'use_meta_language': False,
                                 'target_language': 'en',
                                 'enable_image_fetching': False})
    elif lang == 'arabic':
        from goose.text import StopWordsArabic
        goose_extractor = Goose({'stopwords_class': StopWordsArabic,
                                 'enable_image_fetching': False})
    else:
        # No extractor is configured for other languages, so goose_extractor
        # is left undefined and scraping below will fail for such feeds.
        print(lang)

    for result in rss_results:

        page_url = _convert_url(result.url, website)

        in_database = _check_mongo(page_url, db_collection)

        if not in_database:
            try:
                text, meta = pages_scrape.scrape(page_url, goose_extractor)
                text = text.encode('utf-8')
            except TypeError:
                logger.warning('Problem obtaining text from URL: {}'.format(page_url))
                text = ''
        else:
            logger.debug('Result from {} already in database'.format(page_url))
            text = ''

        if text:
            cleaned_text = _clean_text(text, website)

            entry_id = mongo_connection.add_entry(db_collection, cleaned_text,
                                                  result.title, result.url,
                                                  result.date, website, lang)
            if entry_id:
                try:
                    logger.info('Added entry from {} with id {}'.format(page_url,
                                                                        entry_id))
                except UnicodeDecodeError:
                    logger.info('Added entry from {}. Unicode error for id'.format(result.url))
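A minimal sketch of how parse_results might be driven, assuming the module above is importable as scraper; the feed URL and the database/collection names are placeholders rather than values from the project:

import pattern.web
from pymongo import MongoClient

from scraper import parse_results  # assumes the file above is on the path

# Placeholder feed URL and MongoDB names; adjust to the real configuration.
feed_url = 'http://feeds.bbci.co.uk/news/world/rss.xml'
rss_results = pattern.web.Newsfeed().search(feed_url, count=50, cached=False)

collection = MongoClient()['scraper_data']['bbc']
parse_results(rss_results, 'bbc', 'english', collection)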
Code example #2
File: scraper.py Project: aserlich/scraper
def _parse_results(rss_results, website, db_collection):
    """
    Private function to parse the links drawn from an RSS feed.

    Parameters
    ----------
    rss_results: pattern.web.Results.
                    Object containing data on the parsed RSS feed. Each item
                    represents a unique entry in the RSS feed and contains
                    relevant information such as the URL and title of the
                    story.

    website: String.
                Nickname for the RSS feed being scraped.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    goose_extractor = Goose({
        'use_meta_language': False,
        'target_language': 'en'
    })

    for result in rss_results:
        if website == 'xinhua':
            page_url = result.url.replace('"', '')
            page_url = page_url.encode('ascii')
        elif website == 'upi':
            page_url = result.url.encode('ascii')
        else:
            page_url = result.url.encode('utf-8')

        try:
            text, meta = pages_scrape.scrape(page_url, goose_extractor)
            text = text.encode('utf-8')
        except TypeError:
            logger.warning(
                'Problem obtaining text from URL: {}'.format(page_url))
            text = ''

        if text:
            cleaned_text = _clean_text(text, website)

            entry_id = mongo_connection.add_entry(db_collection, cleaned_text,
                                                  result.title, result.url,
                                                  result.date, website)
            if entry_id:
                try:
                    logger.info('Added entry from {} with id {}'.format(
                        page_url, entry_id))
                except UnicodeDecodeError:
                    logger.info(
                        'Added entry from {}. Unicode error for id'.format(
                            result.url))
            else:
                logger.info(
                    'Result from {} already in database'.format(page_url))
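Unlike example #1, this version performs no duplicate check before scraping; it relies on mongo_connection.add_entry returning a falsy id when the URL is already stored. The _check_mongo helper used in example #1 is not shown in these excerpts; a hypothetical sketch of such a lookup with pymongo, assuming entries store the page URL under a 'url' key:

def _check_mongo(url, db_collection):
    # Hypothetical duplicate check; the project's real helper may differ.
    # Returns True if a document with this URL already exists.
    return db_collection.find_one({'url': url}) is not None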
Code example #3
File: scraper.py Project: aserlich/scraper
def _parse_results(rss_results, website, db_collection):
    """
    Private function to parse the links drawn from an RSS feed.

    Parameters
    ----------
    rss_results: pattern.web.Results.
                    Object containing data on the parsed RSS feed. Each item
                    represents a unique entry in the RSS feed and contains
                    relevant information such as the URL and title of the
                    story.

    website: String.
                Nickname for the RSS feed being scraped.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    goose_extractor = Goose({'use_meta_language': False,
                             'target_language': 'en'})

    for result in rss_results:
        if website == 'xinhua':
            page_url = result.url.replace('"', '')
            page_url = page_url.encode('ascii')
        elif website == 'upi':
            page_url = result.url.encode('ascii')
        else:
            page_url = result.url.encode('utf-8')

        try:
            text, meta = pages_scrape.scrape(page_url, goose_extractor)
            text = text.encode('utf-8')
        except TypeError:
            logger.warning('Problem obtaining text from URL: {}'.format(page_url))
            text = ''

        if text:
            cleaned_text = _clean_text(text, website)

            entry_id = mongo_connection.add_entry(db_collection, cleaned_text,
                                                  result.title, result.url,
                                                  result.date, website)
            if entry_id:
                try:
                    logger.info('Added entry from {} with id {}'.format(page_url,
                                                                        entry_id))
                except UnicodeDecodeError:
                    logger.info('Added entry from {}. Unicode error for id'.format(result.url))
            else:
                logger.info('Result from {} already in database'.format(page_url))
Code example #4
File: scraper.py Project: gaeandy/scraper
            if text:
                if website == 'bbc':
                    text = text.replace(
                        "This page is best viewed in an up-to-date web browser with style sheets (CSS) enabled. While you will be able to view the content of this page in your current browser, you will not be able to get the full visual experience. Please consider upgrading your browser software or enabling style sheets (CSS) if you are able to do so.",
                        '')
                if website == 'almonitor':
                    text = re.sub("^.*?\(photo by REUTERS.*?\)", "", text)
                if website in ('menafn_algeria', 'menafn_bahrain', 'menafn_egypt',
                               'menafn_iraq', 'menafn_jordan', 'menafn_kuwait',
                               'menafn_lebanon', 'menafn_morocco', 'menafn_oman',
                               'menafn_palestine', 'menafn_qatar', 'menafn_saudi',
                               'menafn_syria', 'menafn_tunisia', 'menafn_turkey',
                               'menafn_uae', 'menafn_yemen'):
                    text = re.sub(r"^\(.*?MENAFN.*?\)", "", text)
                elif website == 'upi':
                    text = text.replace(
                        "Since 1907, United Press International (UPI) has been a leading provider of critical information to media outlets, businesses, governments and researchers worldwide. UPI is a global operation with offices in Beirut, Hong Kong, London, Santiago, Seoul and Tokyo. Our headquarters is located in downtown Washington, DC, surrounded by major international policy-making governmental and non-governmental organizations. UPI licenses content directly to print outlets, online media and institutions of all types. In addition, UPI's distribution partners provide our content to thousands of businesses, policy groups and academic institutions worldwide. Our audience consists of millions of decision-makers who depend on UPI's insightful and analytical stories to make better business or policy decisions. In the year of our 107th anniversary, our company strives to continue being a leading and trusted source for news, analysis and insight for readers around the world.",
                        '')

                entry_id = mongo_connection.add_entry(collection, text,
                                                      result.title, result.url,
                                                      result.date, website)
                if entry_id:
                    logger.info('Added entry from {} with id {}'.format(
                        result.url, entry_id))
                else:
                    logger.info('Result from {} already in database'.format(
                        result.url))
    logger.info('Scrape of {} finished'.format(website))


def call_scrape_func(siteList, db_collection):
    """
    Helper function to iterate over a list of RSS feeds and scrape each.

    Parameters
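The per-site cleanup above is a chain of string replacements and regexes. A small, illustrative data-driven variant of the same idea, keyed by the site nickname (the patterns shown are examples, not the project's full list):

import re

# Illustrative patterns only; the real boilerplate strings are the ones shown above.
LEADING_BOILERPLATE = {
    'almonitor': r"^.*?\(photo by REUTERS.*?\)",
}

def strip_boilerplate(text, website):
    # Every MENAFN feed shares one leading pattern; other sites get a lookup.
    if website.startswith('menafn_'):
        return re.sub(r"^\(.*?MENAFN.*?\)", "", text)
    pattern = LEADING_BOILERPLATE.get(website)
    return re.sub(pattern, "", text) if pattern else text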
Code example #5
File: scraper.py Project: manishagalla/BigData
def parse_results(rss_results, website, lang, db_collection):
    """
    Function to parse the links drawn from an RSS feed.
    Parameters
    ----------
    rss_results: pattern.web.Results.
                    Object containing data on the parsed RSS feed. Each item
                    represents a unique entry in the RSS feed and contains
                    relevant information such as the URL and title of the
                    story.
    website: String.
                Nickname for the RSS feed being scraped.
    lang: String.
                Language of the feed ('english' or 'arabic' here); selects the
                Goose extractor configuration.
    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    if lang == 'english':
        goose_extractor = Goose({
            'use_meta_language': False,
            'target_language': 'en',
            'enable_image_fetching': False
        })
    elif lang == 'arabic':
        from goose.text import StopWordsArabic

        goose_extractor = Goose({
            'stopwords_class': StopWordsArabic,
            'enable_image_fetching': False
        })
    else:
        # Unsupported language: goose_extractor stays unbound here, so the
        # scrape call below would raise a NameError for such feeds.
        print(lang)

    for result in rss_results:

        page_url = _convert_url(result.url, website)

        in_database = _check_mongo(page_url, db_collection)

        if not in_database:
            try:
                text, meta = pages_scrape.scrape(page_url, goose_extractor)
                text = text.encode('utf-8')
            except TypeError:
                logger.warning(
                    'Problem obtaining text from URL: {}'.format(page_url))
                text = ''
        else:
            logger.debug('Result from {} already in database'.format(page_url))
            text = ''

        if text:
            cleaned_text = _clean_text(text, website)
            if is_atrocity(cleaned_text):
                print("")
                print("--------------------------------------------------------")
                print(cleaned_text)
                status(cleaned_text)
                entry_id = mongo_connection.add_entry(db_collection,
                                                      cleaned_text,
                                                      result.title, result.url,
                                                      result.date, website,
                                                      lang)
                if entry_id:
                    try:
                        logger.info('Added entry from {} with id {}'.format(
                            page_url, entry_id))
                    except UnicodeDecodeError:
                        logger.info(
                            'Added entry from {}. Unicode error for id'.format(
                                result.url))
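This variant only stores stories that pass is_atrocity and then reports progress through status; neither helper appears in the excerpt. A hypothetical keyword-based filter of the kind such a check might perform (the keyword list is invented for illustration):

# Hypothetical stand-in for the project's is_atrocity check; keywords are illustrative.
ATROCITY_KEYWORDS = ('massacre', 'mass grave', 'ethnic cleansing', 'atrocity')

def is_atrocity(text):
    # True if the cleaned article text mentions any tracked keyword.
    lowered = text.lower()
    return any(keyword in lowered for keyword in ATROCITY_KEYWORDS)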
Code example #6
def scrape_func(address, website):
    """
    Function to scrape various RSS feeds. Uses the 'keep' and 'ignore'
    iterables to define which words should be used in the text search.

    Inputs
    ------
    address : address for the RSS feed to scrape. String.

    website : name of the website to scrape; selects the MongoDB collection
    and the per-site URL handling. String.

    The MongoDB connection is created inside the function and always targets
    the local 'atrocities_data' database, so no database argument is taken.
    """
    connection = MongoClient()
    db = connection.atrocities_data
    collection = db[website]

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    corenlp_dir = 'stanford-corenlp/'
    corenlp_parse = StanfordCoreNLP(corenlp_dir)

    log = open('log_file.txt', 'a')
    results = pattern.web.Newsfeed().search(address, count=100, cached=False)
    log1 = 'There are %d results from %s \n' % (len(results), website)
    log.write(log1)
    for result in results:
        # All supported feeds share the same pipeline; only xinhua URLs need
        # to be ASCII-encoded and stripped of stray quotes before scraping.
        if website not in ('nyt', 'bbc', 'reuters', 'ap', 'upi', 'google',
                           'xinhua'):
            continue
        if website == 'xinhua':
            page_url = result.url.encode('ascii').replace('"', '')
        else:
            page_url = result.url
        text = pages_scrape.scrape(page_url, result.title)
        head_sentences = sent_detector.tokenize(text.strip())[:4]
        joined_sentences = ' '.join(head_sentences)
        parsed = corenlp_parse.raw_parse(joined_sentences)
        entry_id = mongo_connection.add_entry(collection, text, parsed,
                                              result.title, result.url,
                                              result.date, website)
        if entry_id:
            log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                          str(entry_id))
        else:
            log2 = 'Result from %s already in database \n' % (result.url)
        log.write(log2)
    interrupt = '+' * 70
    log3 = '%s\nScraped %s once at %s!\n%s\n' % (interrupt, website,
                                                 datetime.datetime.now(),
                                                 interrupt)
    log.write(log3)
    log.close()
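A small driver of the kind call_scrape_func (referenced in examples #4 and #7) might implement, assuming a mapping from site nickname to feed URL; the URLs below are placeholders:

# Placeholder feed list; the project's real configuration is not shown here.
FEEDS = {
    'bbc': 'http://feeds.bbci.co.uk/news/world/rss.xml',
    'reuters': 'http://feeds.reuters.com/Reuters/worldNews',
}

for website, address in FEEDS.items():
    scrape_func(address, website)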
Code example #7
File: scraper.py Project: gaeandy/scraper
            except TypeError:
                logger.warning('Problem obtaining text from URL: {}'.format(page_url))
                text = ''

            if text:
                if website == 'bbc':
                    text = text.replace("This page is best viewed in an up-to-date web browser with style sheets (CSS) enabled. While you will be able to view the content of this page in your current browser, you will not be able to get the full visual experience. Please consider upgrading your browser software or enabling style sheets (CSS) if you are able to do so.", '')
                if website == 'almonitor':
                    text = re.sub("^.*?\(photo by REUTERS.*?\)", "", text)
                if website in ('menafn_algeria', 'menafn_bahrain', 'menafn_egypt',
                               'menafn_iraq', 'menafn_jordan', 'menafn_kuwait',
                               'menafn_lebanon', 'menafn_morocco', 'menafn_oman',
                               'menafn_palestine', 'menafn_qatar', 'menafn_saudi',
                               'menafn_syria', 'menafn_tunisia', 'menafn_turkey',
                               'menafn_uae', 'menafn_yemen'):
                    text = re.sub(r"^\(.*?MENAFN.*?\)", "", text)
                elif website == 'upi':
                    text = text.replace("Since 1907, United Press International (UPI) has been a leading provider of critical information to media outlets, businesses, governments and researchers worldwide. UPI is a global operation with offices in Beirut, Hong Kong, London, Santiago, Seoul and Tokyo. Our headquarters is located in downtown Washington, DC, surrounded by major international policy-making governmental and non-governmental organizations. UPI licenses content directly to print outlets, online media and institutions of all types. In addition, UPI's distribution partners provide our content to thousands of businesses, policy groups and academic institutions worldwide. Our audience consists of millions of decision-makers who depend on UPI's insightful and analytical stories to make better business or policy decisions. In the year of our 107th anniversary, our company strives to continue being a leading and trusted source for news, analysis and insight for readers around the world.", '')

                entry_id = mongo_connection.add_entry(collection, text,
                                                      result.title, result.url,
                                                      result.date, website)
                if entry_id:
                    logger.info('Added entry from {} with id {}'.format(result.url,
                                                                        entry_id))
                else:
                    logger.info('Result from {} already in database'.format(result.url))
    logger.info('Scrape of {} finished'.format(website))


def call_scrape_func(siteList, db_collection):
    """
    Helper function to iterate over a list of RSS feeds and scrape each.

    Parameters
Code example #8
def parse_results(rss_results, website, lang, db_collection):

    """
    Function to parse the links drawn from an RSS feed.

    Parameters
    ----------

    rss_results: pattern.web.Results.
                    Object containing data on the parsed RSS feed. Each item
                    represents a unique entry in the RSS feed and contains
                    relevant information such as the URL and title of the
                    story.

    website: String.
                Nickname for the RSS feed being scraped.

    lang: String.
                Language of the feed; mapped through languageMap to pick the
                newspaper text extractor.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """

    langCode = languageMap.get(lang)
    if langCode is not None:
        newspaper_extractor = NewspaperTextExtractor(language=langCode)
    else:
        # No extractor exists for this language; newspaper_extractor stays
        # undefined and the scrape below will fail for such feeds.
        print("ERROR: Extractor for {} is not available".format(lang))

    #producer = KafkaProducer(bootstrap_servers='dmlhdpc1')
    client = SimpleClient('172.29.100.6:9092')
    producer = SimpleProducer(client)

    #print "Parsing Results"
    for result in rss_results:

        page_url = _convert_url(result.url, website)
        print(page_url)
        in_database = _check_mongo(page_url, db_collection)

        if not in_database:
            try:
                text, meta = pages_scrape.scrape(page_url, newspaper_extractor)
                text = text.encode('utf-8')
                #print meta
            except TypeError:
                logger.warning(
                    'Problem obtaining text from URL: {}'.format(page_url))
                text = ''
            except UnicodeDecodeError:
                logger.warning(
                    'Unicode Decoding Issue in URL: {}'.format(page_url))
                text = ''
        else:
            logger.debug('Result from {} already in database'.format(page_url))
            text = ''

        if text:
            #print "Adding Document"
            cleaned_text = _clean_text(text, website)

            entry_id = mongo_connection.add_entry(db_collection, cleaned_text,
                                                  result.title, result.url,
                                                  result.date, website, lang)

            jsonData = {
                "url": result.url,
                "title": result.title,
                "source": website,
                "date": result.date,
                "date_added": datetime.datetime.utcnow(),
                "content": cleaned_text,
                "stanford": 0,
                "language": lang,
                "processed": "False",
                "mongo_id": str(entry_id),
                "type": "story"
            }
            doc = convert_to_SGML(jsonData)
            producer.send_messages("test",
                                   str(entry_id) + "#" + doc.encode('utf-8'))
            #print "Message Sent"
            #producer.send_messages('test', doc.encode("utf-8"))

            if entry_id:
                try:
                    logger.info('Added entry from {} with id {}'.format(
                        page_url, entry_id))
                except UnicodeDecodeError:
                    logger.info(
                        'Added entry from {}. Unicode error for id'.format(
                            result.url))

    client.close()
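On the consuming side, each message published to the 'test' topic carries the MongoDB id and the SGML document joined by '#'. A minimal sketch of reading these back with kafka-python's newer KafkaConsumer API (the broker address mirrors the one hard-coded above; the message framing is an assumption based on the producer call):

from kafka import KafkaConsumer

# Same broker as the producer above; topic name taken from the snippet.
consumer = KafkaConsumer('test', bootstrap_servers='172.29.100.6:9092')

for message in consumer:
    entry_id, _, sgml_doc = message.value.decode('utf-8').partition('#')
    print('Received document for MongoDB id {}'.format(entry_id))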