Example #1
def parse_results(rss_results, website, lang, db_collection):
    """
    Function to parse the links drawn from an RSS feed.

    Parameters
    ----------

    rss_results: pattern.web.Results.
                    Object containing data on the parsed RSS feed. Each item
                    represents a unique entry in the RSS feed and contains
                    relevant information such as the URL and title of the
                    story.

    website: String.
                Nickname for the RSS feed being scraped.

    lang: String.
            Language of the feed; 'english' and 'arabic' are supported.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    if lang == 'english':
        goose_extractor = Goose({'use_meta_language': False,
                                 'target_language': 'en',
                                 'enable_image_fetching': False})
    elif lang == 'arabic':
        from goose.text import StopWordsArabic
        goose_extractor = Goose({'stopwords_class': StopWordsArabic,
                                 'enable_image_fetching': False})
    else:
        logger.warning('Unsupported language: {}'.format(lang))
        return

    for result in rss_results:

        page_url = _convert_url(result.url, website)

        in_database = _check_mongo(page_url, db_collection)

        if not in_database:
            try:
                text, meta = pages_scrape.scrape(page_url, goose_extractor)
                text = text.encode('utf-8')
            except TypeError:
                logger.warning('Problem obtaining text from URL: {}'.format(page_url))
                text = ''
        else:
            logger.debug('Result from {} already in database'.format(page_url))
            text = ''

        if text:
            cleaned_text = _clean_text(text, website)

            entry_id = mongo_connection.add_entry(db_collection, cleaned_text,
                                                  result.title, result.url,
                                                  result.date, website, lang)
            if entry_id:
                try:
                    logger.info('Added entry from {} with id {}'.format(page_url,
                                                                        entry_id))
                except UnicodeDecodeError:
                    logger.info('Added entry from {}. Unicode error for id'.format(result.url))
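
The private helpers _convert_url and _check_mongo are called above but not
defined in this snippet. A minimal sketch of what they plausibly do, inferred
from the inline URL handling in Example #2 and from how the duplicate check
is used; the exact logic is an assumption:

def _convert_url(url, website):
    # Hypothetical reconstruction: Example #2 strips stray quotes from
    # xinhua URLs and byte-encodes every URL before scraping.
    if website == 'xinhua':
        url = url.replace('"', '')
    return url.encode('utf-8')


def _check_mongo(url, db_collection):
    # Assumed duplicate check: True if a document with this URL already
    # exists in the collection.
    return db_collection.find_one({'url': url}) is not None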
Example #2
def _parse_results(rss_results, website, db_collection):
    """
    Private function to parse the links drawn from an RSS feed.

    Parameters
    ----------
    rss_results: pattern.web.Results.
                    Object containing data on the parsed RSS feed. Each item
                    represents a unique entry in the RSS feed and contains
                    relevant information such as the URL and title of the
                    story.

    website: String.
                Nickname for the RSS feed being scraped.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    goose_extractor = Goose({'use_meta_language': False,
                             'target_language': 'en'})

    for result in rss_results:
        if website == 'xinhua':
            page_url = result.url.replace('"', '')
            page_url = page_url.encode('ascii')
        elif website == 'upi':
            page_url = result.url.encode('ascii')
        else:
            page_url = result.url.encode('utf-8')

        try:
            text, meta = pages_scrape.scrape(page_url, goose_extractor)
            text = text.encode('utf-8')
        except TypeError:
            logger.warning('Problem obtaining text from URL: {}'.format(page_url))
            text = ''

        if text:
            cleaned_text = _clean_text(text, website)

            entry_id = mongo_connection.add_entry(db_collection, cleaned_text,
                                                  result.title, result.url,
                                                  result.date, website)
            if entry_id:
                try:
                    logger.info('Added entry from {} with id {}'.format(page_url,
                                                                        entry_id))
                except UnicodeDecodeError:
                    logger.info('Added entry from {}. Unicode error for id'.format(result.url))
            else:
                logger.info('Result from {} already in database'.format(page_url))
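
_clean_text is called by both parsers above but never defined. Example #4
below performs the same cleanup inline, so a helper consistent with it might
look as follows; the signature matches the call sites, but the cleaning rules
are assumptions:

import re

# Site-specific boilerplate to strip; the full BBC and UPI strings appear
# verbatim in Example #4 and are truncated here.
BOILERPLATE = {
    'bbc': 'This page is best viewed in an up-to-date web browser...',
}


def _clean_text(text, website):
    boilerplate = BOILERPLATE.get(website)
    if boilerplate:
        text = text.replace(boilerplate, '')
    if website == 'almonitor':
        text = re.sub(r"^.*?\(photo by REUTERS.*?\)", "", text)
    if website.startswith('menafn'):
        text = re.sub(r"^\(.*?MENAFN.*?\)", "", text)
    return text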
Example #3
def scrape_func(address, website):
    """
    Function to scrape various RSS feeds. Uses the 'keep' and 'ignore'
    iterables to define which words should be used in the text search.

    Inputs
    ------
    address : address for the RSS feed to scrape. String.

    website : name of the website to scrape to be used in the filepath for the
    output. String.

    database : name of the MongoDB database that contains the collections.
    String? pymongo connection object?
    """
    connection = MongoClient()
    db = connection.atrocities_data
    collection = db[website]

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    corenlp_dir = 'stanford-corenlp/'
    corenlp_parse = StanfordCoreNLP(corenlp_dir)

    log = open('log_file.txt', 'a')
    results = pattern.web.Newsfeed().search(address, count=100, cached=False)
    log1 = 'There are %d results from %s \n' % (len(results), website)
    log.write(log1)
    for result in results:
        if website not in ('nyt', 'bbc', 'reuters', 'ap', 'upi', 'xinhua',
                           'google'):
            continue
        if website == 'xinhua':
            # Xinhua URLs arrive wrapped in quotes; strip them before
            # scraping.
            page_url = result.url.encode('ascii')
            page_url = page_url.replace('"', '')
        else:
            page_url = result.url

        text = pages_scrape.scrape(page_url, result.title)
        head_sentences = sent_detector.tokenize(text.strip())[:4]
        joined_sentences = ' '.join(head_sentences)
        parsed = corenlp_parse.raw_parse(joined_sentences)
        entry_id = mongo_connection.add_entry(collection, text, parsed,
                                              result.title, result.url,
                                              result.date, website)
        if entry_id:
            log2 = 'Added entry from %s with id %s \n' % (result.url,
                                                          str(entry_id))
            log.write(log2)
        else:
            log2 = 'Result from %s already in database \n' % (result.url)
            log.write(log2)
    separator = '+' * 70
    log3 = '%s\nScraped %s at %s\n%s\n' % (separator, website,
                                           datetime.datetime.now(),
                                           separator)
    log.write(log3)
    log.close()
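
All of these scrapers rely on mongo_connection.add_entry returning a falsy
value when a story is already stored (Example #3's variant also takes the
CoreNLP parse). A sketch of that contract, assuming a pymongo collection
keyed on the story URL; the real module is not shown here:

def add_entry(collection, text, title, url, date, website):
    # Assumed behavior: insert only if the URL is new; return the new
    # ObjectId on success and None for duplicates.
    if collection.find_one({'url': url}):
        return None
    entry = {'text': text, 'title': title, 'url': url,
             'date': date, 'website': website}
    return collection.insert_one(entry).inserted_id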
Example #4
            except TypeError:
                logger.warning('Problem obtaining text from URL: {}'.format(page_url))
                text = ''

            if text:
                if website == 'bbc':
                    text = text.replace("This page is best viewed in an up-to-date web browser with style sheets (CSS) enabled. While you will be able to view the content of this page in your current browser, you will not be able to get the full visual experience. Please consider upgrading your browser software or enabling style sheets (CSS) if you are able to do so.", '')
                if website == 'almonitor':
                    text = re.sub("^.*?\(photo by REUTERS.*?\)", "", text)
                if website == 'menafn_algeria' or website == 'menafn_bahrain' or website == 'menafn_egypt' or website == 'menafn_iraq' or website == 'menafn_jordan' or website == 'menafn_kuwait' or website == 'menafn_lebanon' or website == 'menafn_morocco' or website == 'menafn_oman' or website == 'menafn_palestine' or website == 'menafn_qatar' or website == 'menafn_saudi' or website == 'menafn_syria' or website == 'menafn_tunisia' or website == 'menafn_turkey' or website == 'menafn_uae' or website == 'menafn_yemen':
                    text = re.sub("^\(.*?MENAFN.*?\)", "", text)
                elif website == 'upi':
                    text = text.replace("Since 1907, United Press International (UPI) has been a leading provider of critical information to media outlets, businesses, governments and researchers worldwide. UPI is a global operation with offices in Beirut, Hong Kong, London, Santiago, Seoul and Tokyo. Our headquarters is located in downtown Washington, DC, surrounded by major international policy-making governmental and non-governmental organizations. UPI licenses content directly to print outlets, online media and institutions of all types. In addition, UPI's distribution partners provide our content to thousands of businesses, policy groups and academic institutions worldwide. Our audience consists of millions of decision-makers who depend on UPI's insightful and analytical stories to make better business or policy decisions. In the year of our 107th anniversary, our company strives to continue being a leading and trusted source for news, analysis and insight for readers around the world.", '')

                entry_id = mongo_connection.add_entry(collection, text,
                                                      result.title, result.url,
                                                      result.date, website)
                if entry_id:
                    logger.info('Added entry from {} with id {}'.format(result.url,
                                                                        entry_id))
                else:
                    logger.info('Result from {} already in database'.format(result.url))
    logger.info('Scrape of {} finished'.format(website))
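
Each snippet also references a module-level logger that is never set up in
the code shown. A conventional configuration, included only as an assumption
about the surrounding module:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)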


def call_scrape_func(siteList, db_collection):
    """
    Helper function to iterate over a list of RSS feeds and scrape each.

    Parameters