Example #1
    def getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)):
        attributes = []
        class_vars = []
        metas = [(data.StringVariable('Message'), lambda doc: doc['message']),
                 (data.DiscreteVariable('Type'), lambda doc: doc['type']),
                 (data.StringVariable('Post ID'), lambda doc: doc['post_id']),
                 (data.StringVariable('Comment ID'),
                  lambda doc: doc['comment_id']),
                 (data.StringVariable('Parent comment ID'),
                  lambda doc: doc['parent_comment_id']),
                 (data.ContinuousVariable('likes'), lambda doc: doc['likes']),
                 (data.ContinuousVariable('comment replies'),
                  lambda doc: doc['comment_replies']),
                 (data.TimeVariable('Publication Date'),
                  lambda doc: doc['status_published']),
                 (data.TimeVariable('Publication Date UTC'),
                  lambda doc: doc['status_published_utc'])]
        text_features = [metas[0][0]]
        title_indices = [-1]

        results = []
        for doc in self._getComments(post_ids, comment_replies, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook comments', attributes,
                                  class_vars, metas, title_indices)
        c.set_text_features(text_features)
        return c
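
A minimal usage sketch (hypothetical, not from the project): this getComments method belongs to the FacebookOrangeAPI class shown in Example #5, whose getData only needs a credentials object with a token attribute. The token and post IDs below are placeholders.

from types import SimpleNamespace

# Placeholder credentials: getData() only reads credentials.token.
credentials = SimpleNamespace(token='YOUR_PAGE_ACCESS_TOKEN')
api = FacebookOrangeAPI(credentials)

# Fetch comments (and their replies) for two hypothetical post IDs.
corpus = api.getComments(['PAGEID_POSTID1', 'PAGEID_POSTID2'],
                         comment_replies=True)
print(len(corpus), 'comments')
print([m.name for m in corpus.domain.metas])  # meta variables defined above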
Example #2
File: nyt.py  Project: seyyaw/orange3-text
class NYT:
    """ Class for fetching records from the NYT API. """

    @staticmethod
    def keywords(doc, name):
        return ', '.join([kw.get('value')
                          for kw in doc.get('keywords', [])
                          if kw['name'] == name])

    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc.get('section_name', None)),
    ]

    tv = data.TimeVariable('Publication Date')
    metas = [
        (data.StringVariable('Headline'), lambda doc: doc.get('headline', {}).get('main') or ''),
        (data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
        (data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
        (data.StringVariable('Lead Paragraph'), lambda doc: doc.get('lead_paragraph') or ''),
        (data.StringVariable('Subject Keywords'), lambda doc: NYT.keywords(doc, 'subject')),
        (data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
        (data.StringVariable('Locations'), lambda doc: NYT.keywords(doc, 'glocations')),
        (data.StringVariable('Persons'), lambda doc: NYT.keywords(doc, 'persons')),
        (data.StringVariable('Organizations'), lambda doc: NYT.keywords(doc, 'organizations')),
        (data.StringVariable('Creative Works'), lambda doc: NYT.keywords(doc, 'creative_works')),
        (tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
        (data.DiscreteVariable('Article Type'), lambda doc: doc.get('type_of_material', None)),
        (data.ContinuousVariable('Word Count', number_of_decimals=0), lambda doc: doc.get('word_count', None)),
    ]

    text_features = [metas[0][0], metas[1][0]]  # headline + abstract

    def __init__(self, api_key):
        """
        Args:
            api_key (str): NY Times API key.
        """
        self.api_key = api_key
        self.on_error = None
        self.on_rate_limit = None
        self.on_no_connection = None
        self.cache_path = None
        self._cache_init()

    def api_key_valid(self):
        """ Checks whether api key given at initialization is valid. """
        url = self._encode_url('test')
        try:
            with request.urlopen(url) as connection:
                if connection.getcode() == 200:
                    return True
        except (HTTPError, URLError, HTTPException):
            return False

    def search(self, query, date_from=None, date_to=None, max_docs=None,
               on_progress=None, should_break=None):
        """
        Args:
            query (str): Search query.
            date_from (date): Start date limit.
            date_to (date): End date limit.
            max_docs (int): Maximal number of documents returned.
            on_progress (callback): Called after every iteration of downloading.
            should_break (callback): Callback for breaking the computation before the end.
                If it evaluates to True, downloading is stopped and the documents
                downloaded so far are returned in a Corpus.

        Returns:
            Corpus: Search results.
        """
        if max_docs is None or max_docs > MAX_DOCS:
            max_docs = MAX_DOCS

        # TODO create corpus on the fly and extend, so it stops faster.
        records = []
        data, go_sleep = self._fetch_page(query, date_from, date_to, 0)
        if data is None:
            return None
        records.extend(data['response']['docs'])
        max_docs = min(data['response']['meta']['hits'], max_docs)
        if callable(on_progress):
            on_progress(len(records), max_docs)

        for page in range(1, math.ceil(max_docs/BATCH_SIZE)):
            if callable(should_break) and should_break():
                break

            if go_sleep:
                sleep(SLEEP)

            data, go_sleep = self._fetch_page(query, date_from, date_to, page)

            if data is None:
                break

            records.extend(data['response']['docs'])
            if callable(on_progress):
                on_progress(len(records), max_docs)

        if len(records) > max_docs:
            records = records[:max_docs]

        return Corpus.from_documents(records, 'NY Times', self.attributes,
                                     self.class_vars, self.metas, title_indices=[-1])

    def _cache_init(self):
        """ Initialize cache in Orange environment buffer dir. """
        path = os.path.join(environ.cache_dir(), "nytcache")
        try:
            if not os.path.exists(path):
                os.makedirs(path)
            self.cache_path = os.path.join(path, "query_cache")
        except OSError as e:
            warnings.warn('Could not initialize NYT cache: {}'.format(str(e)), RuntimeWarning)

    def _cache_fetch(self, url):
        """ Fetch URL from cache if present. """
        with shelve.open(self.cache_path) as cache:
            if url in cache.keys():
                return cache[url]
            else:
                return None

    def _cache_store(self, url, data):
        """ Store data for URL in cache. """
        with shelve.open(self.cache_path) as cache:
            cache[url] = data

    def _fetch_page(self, query, date_from, date_to, page):
        """ Fetch one page either from cache or web. """
        cache_url = self._encode_url(query, date_from, date_to, page, for_caching=True)
        data = self._cache_fetch(cache_url)
        if data:
            return data, False
        else:
            url = self._encode_url(query, date_from, date_to, page, for_caching=False)
            try:
                with request.urlopen(url, timeout=TIMEOUT) as conn:
                    data = conn.read().decode('utf-8')
            except HTTPError as e:
                if e.code == 403 and page > 0:
                    # occasionally some pages return error 403 (Forbidden)
                    # while all other page numbers seem to work just fine.
                    # Skip such pages and don't break loading!
                    warnings.warn('NYT api returned HTTPError with code 403 '
                                  '(Forbidden)! Skipping this page ...')
                    return {'response': {'docs': []}}, True
                if e.code == 429 and callable(self.on_rate_limit):
                    self.on_rate_limit()
                elif callable(self.on_error):
                    self.on_error(str(e))
                return None, False
            except URLError:
                if callable(self.on_no_connection):
                    self.on_no_connection()
                    return None, False
                raise
            data = json.loads(data)
            self._cache_store(cache_url, data)
            return data, True

    def _encode_url(self, query, date_from=None, date_to=None, page=0, for_caching=False):
        """
        Encode url for given query, date restrictions and page number.

        Args:
            query (str): Search query.
            date_from (date): Date restriction.
            date_to (date): Date restriction.
            page (int): Page number.
            for_caching (bool): Whether URL would be used for caching. If set, exclude BASE_URL
                and API key.

        Returns:
            str: An encoded URL.
        """
        params = [   # list required to preserve order - important for caching
            ('fq', 'The New York Times'),
            ('api-key', self.api_key),
            ('q', query),
            ('page', page),
        ]
        if date_from:
            params.append(('begin_date', date_from.strftime('%Y%m%d')))
        if date_to:
            params.append(('end_date', date_to.strftime('%Y%m%d')))

        if for_caching:     # remove api key, return only params
            del params[0]
            return parse.urlencode(params)
        else:
            return '{}?{}'.format(BASE_URL, parse.urlencode(params))
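
A minimal usage sketch (hypothetical): how the NYT class above might be driven. The API key and query are placeholders; per the docstring, search accepts date objects for the limits and returns a Corpus, or None if the first request fails.

from datetime import date

nyt = NYT(api_key='YOUR_NYT_API_KEY')
if nyt.api_key_valid():
    corpus = nyt.search('climate change',
                        date_from=date(2016, 1, 1),
                        date_to=date(2016, 12, 31),
                        max_docs=50,
                        on_progress=lambda done, total: print(done, '/', total))
    if corpus is not None:
        print(len(corpus), 'articles')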
Example #3
class TheGuardianAPI:
    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc['sectionName']),
    ]

    tv = data.TimeVariable('Publication Date')
    metas = [
        (data.StringVariable('Headline'), lambda doc: doc['fields']['headline']),
        (data.StringVariable('Content'), lambda doc: doc['fields']['bodyText']),
        (data.StringVariable('Trail Text'), lambda doc: doc['fields']['trailText']),
        (data.StringVariable('HTML'), lambda doc: doc['fields']['body']),
        (tv, lambda doc: TheGuardianAPI.tv.parse(doc['webPublicationDate'])),
        (data.DiscreteVariable('Type'), lambda doc: doc['type']),
        (data.DiscreteVariable('Language'), lambda doc: doc['fields']['lang']),
        (data.StringVariable('Tags'),
            lambda doc: ', '.join(tag['webTitle'] for tag in doc['tags'])),
        (data.StringVariable('URL'), lambda doc: doc['webUrl']),
        (data.ContinuousVariable('Word Count', number_of_decimals=0),
            lambda doc: doc['fields']['wordcount']),
    ]

    text_features = [metas[0][0], metas[1][0]]  # Headline + Content
    title_indices = [-1]    # Headline

    def __init__(self, credentials, on_progress=None, should_break=None):
        """
        Args:
            credentials (:class:`TheGuardianCredentials`): The Guardian credentials.
            on_progress (callable): Function for progress reporting.
            should_break (callable): Function for early stopping.
        """
        self.per_page = ARTICLES_PER_PAGE
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

        self.results = []

    def _search(self, query, from_date, to_date, page=1):
        data = self._build_query(query, from_date, to_date, page)

        response = requests.get(BASE_URL, data)
        parsed = json.loads(response.text)

        if page == 1:   # store number of pages
            self.pages = parsed['response']['pages']

        self.results.extend(parsed['response']['results'])

    def _build_query(self, query, from_date=None, to_date=None, page=1):
        data = {
            'q': query,
            'api-key': self.credentials.key,
            'page': str(page),
            'show-fields': 'headline,trailText,body,bodyText,lang,wordcount',
            'show-tags': 'all',
        }
        if from_date is not None:
            data['from-date'] = from_date
        if to_date is not None:
            data['to-date'] = to_date

        return data

    def search(self, query, from_date=None, to_date=None, max_documents=None,
               accumulate=False):
        """
        Search The Guardian API for articles.

        Args:
            query (str): The query to search the articles by.
            from_date (str): Search only articles newer than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            to_date (str): Search only articles older than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            max_documents (int): Maximum number of documents to retrieve.
                When not given, retrieve all documents.
            accumulate (bool): A flag indicating whether to accumulate results
                across consecutive search calls.

        Returns:
            :ref:`Corpus`
        """
        if not accumulate:
            self.results = []

        self._search(query, from_date, to_date)

        pages = math.ceil(max_documents/self.per_page) if max_documents else self.pages
        self.on_progress(self.per_page, pages * self.per_page)

        for p in range(2, pages + 1):   # pages are one-based; page 1 was fetched above
            if self.should_break():
                break
            self._search(query, from_date, to_date, p)
            self.on_progress(p*self.per_page, pages * self.per_page)

        c = Corpus.from_documents(
            self.results, 'The Guardian', self.attributes, self.class_vars,
            self.metas, title_indices=self.title_indices)
        c.text_features = self.text_features
        return c
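
A minimal usage sketch (hypothetical): the key and query are placeholders, and it assumes TheGuardianCredentials from the same module simply wraps the API key, as the constructor's docstring suggests.

credentials = TheGuardianCredentials('YOUR_GUARDIAN_API_KEY')
guardian = TheGuardianAPI(credentials)

# Dates are ISO-formatted strings, per the search() docstring.
corpus = guardian.search('brexit', from_date='2016-06-01',
                         to_date='2016-06-30', max_documents=100)
print(len(corpus), 'articles')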
Example #4
class TwitterAPI:
    """ Fetch tweets from the Tweeter API.

    Notes:
        Results across multiple searches are aggregated. To remove tweets form
        previous searches and only return results from the last search either
        call `reset` method before searching or provide `collecting=False`
        argument to search method.
    """
    attributes = []
    class_vars = [
        (data.DiscreteVariable('Author'), lambda doc: '@' + doc.author.screen_name),
    ]

    tv = data.TimeVariable('Date')
    metas = [
        (data.StringVariable('Content'), lambda doc: doc.text),
        (tv, lambda doc: TwitterAPI.tv.parse(doc.created_at.isoformat())),
        (data.DiscreteVariable('Language'), lambda doc: doc.lang),
        (data.DiscreteVariable('Location'), lambda doc: getattr(doc.place, 'country_code', None)),
        (data.ContinuousVariable('Number of Likes', number_of_decimals=0),
         lambda doc: doc.favorite_count),
        (data.ContinuousVariable('Number of Retweets', number_of_decimals=0),
         lambda doc: doc.retweet_count),
        (data.DiscreteVariable('In Reply To'),
            lambda doc: '@' + doc.in_reply_to_screen_name if doc.in_reply_to_screen_name else ''),
        (data.DiscreteVariable('Author Name'), lambda doc: doc.author.name),
        (data.StringVariable('Author Description'), lambda doc: doc.author.description),
        (data.ContinuousVariable('Author Statuses Count', number_of_decimals=0),
         lambda doc: doc.author.statuses_count),
        (data.ContinuousVariable('Author Favourites Count', number_of_decimals=0),
         lambda doc: doc.author.favourites_count),
        (data.ContinuousVariable('Author Friends Count', number_of_decimals=0),
         lambda doc: doc.author.friends_count),
        (data.ContinuousVariable('Author Followers Count', number_of_decimals=0),
         lambda doc: doc.author.followers_count),
        (data.ContinuousVariable('Author Listed Count', number_of_decimals=0),
         lambda doc: doc.author.listed_count),
        (data.DiscreteVariable('Author Verified'), lambda doc: str(doc.author.verified)),
        (data.ContinuousVariable('Longitude'),
            lambda doc: coordinates_geoJSON(doc.coordinates)[0]),
        (data.ContinuousVariable('Latitude'),
            lambda doc: coordinates_geoJSON(doc.coordinates)[1]),
    ]

    text_features = [metas[0][0]]       # Content
    string_attributes = [m for m, _ in metas
                         if isinstance(m, data.StringVariable)]

    def __init__(self, credentials,
                 on_progress=None, should_break=None,
                 on_error=None, on_rate_limit=None):
        self.key = credentials
        self.api = tweepy.API(credentials.auth)
        self.container = OrderedDict()
        self.search_history = []

        # Callbacks:
        self.on_error = on_error
        self.on_rate_limit = on_rate_limit
        self.on_progress = on_progress or (lambda *args: args)
        self.should_break = should_break or (lambda *args: False)

    @property
    def tweets(self):
        return self.container.values()

    def search_content(self, content, *, max_tweets=0,
                       lang=None, allow_retweets=True,
                       collecting=False):
        """ Search by content.

        Args:
            content (list of str): A list of key words to search for.
            max_tweets (int): If greater than zero limits the number of
                downloaded tweets.
            lang (str): A language's code (either ISO 639-1 or ISO 639-3
                formats).
            allow_retweets (bool): Whether to download retweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:
            max_tweets = float('Inf')

        def build_query():
            nonlocal content
            if not content:
                q = 'from: '
            else:
                if not isinstance(content, list):
                    content = [content]
                q = ' OR '.join(['"{}"'.format(q) for q in content])
            if not allow_retweets:
                q += ' -filter:retweets'
            return q

        query = build_query()
        cursor = tweepy.Cursor(self.api.search, q=query, lang=lang)
        corpus, count = self.fetch(cursor, max_tweets)
        self.append_history('Content', content, lang if lang else 'Any',
                            str(allow_retweets), count)
        return corpus

    def search_authors(self, authors, *, max_tweets=0, collecting=False):
        """ Search by authors.

        Args:
            authors (list of str): A list of authors to search for.
            max_tweets (int): If greater than zero limits the number of
                downloaded tweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:     # set to max allowed for progress
            max_tweets = 3200

        if not isinstance(authors, list):
            authors = [authors]

        cursors = [tweepy.Cursor(self.api.user_timeline, screen_name=a)
                   for a in authors]
        corpus, count = self.fetch(cursors, max_tweets)
        self.append_history('Author', authors, None, None, count)
        return corpus

    def fetch(self, cursors, max_tweets):
        if not isinstance(cursors, list):
            cursors = [cursors]

        count = 0
        try:
            for i, cursor in enumerate(cursors):
                for j, tweet in enumerate(cursor.items(max_tweets), start=1):
                    if self.should_break():
                        break
                    if tweet.id not in self.container:
                        count += 1
                    self.container[tweet.id] = tweet
                    if j % 20 == 0:
                        self.on_progress(len(self.container),
                                         (i*max_tweets + j)/
                                         (len(cursors)*max_tweets))
                if self.should_break():
                    break
        except tweepy.TweepError as e:
            if e.response.status_code == 429 and self.on_rate_limit:
                self.on_rate_limit()
            elif self.on_error:
                self.on_error(str(e))
                return None, 0
        return self.create_corpus(), count

    def create_corpus(self):
        return Corpus.from_documents(self.tweets, 'Twitter', self.attributes,
                                     self.class_vars, self.metas,
                                     title_indices=[-1])

    def reset(self):
        """ Removes all downloaded tweets. """
        self.search_history = []
        self.container = OrderedDict()

    def append_history(self, mode, query, lang, allow_retweets, n_tweets):
        query = ', '.join(query) if isinstance(query, Iterable) else query
        if lang in code2lang.keys():
            lang = code2lang[lang]
        self.search_history.append((
            ('Query', query),
            ('Search by', mode),
            ('Language', lang),
            ('Allow retweets', allow_retweets),
            ('Tweets count', n_tweets),
        ))

    def report(self):
        return self.search_history
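
A minimal usage sketch (hypothetical): the class targets tweepy 3.x (it uses tweepy.TweepError and API.search), and its constructor only reads an auth handler off the credentials object. Keys and the author name below are placeholders.

import tweepy
from types import SimpleNamespace

# Placeholder credentials: __init__ only uses credentials.auth.
auth = tweepy.AppAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
api = TwitterAPI(SimpleNamespace(auth=auth))

corpus = api.search_content(['orange data mining'], max_tweets=100, lang='en')
print(len(corpus), 'tweets')

# A follow-up search with collecting=True keeps the previously fetched tweets.
corpus = api.search_authors(['some_screen_name'], max_tweets=50, collecting=True)
print(api.report())  # history of the searches performed so far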
Example #5
class FacebookOrangeAPI():
    attributes = []
    class_vars = []
    image_var = data.StringVariable.make("image")
    image_var.attributes["type"] = "image"
    post_metas = [
        (data.StringVariable('Message'), lambda doc: doc['status_message']),
        (data.DiscreteVariable('From'), lambda doc: doc['from_name']),
        (data.ContinuousVariable('likes'), lambda doc: doc['like']),
        (data.ContinuousVariable('comments'), lambda doc: doc['comments']),
        (data.ContinuousVariable('shares'), lambda doc: doc['shares']),
        (data.DiscreteVariable('top emotion'),
         lambda doc: doc['top_reaction']),
        (data.StringVariable('Link name'), lambda doc: doc['link_name']),
        (image_var, lambda doc: doc['picture']),
        (data.StringVariable('link'), lambda doc: doc['status_link']),
        (data.DiscreteVariable('From ID'), lambda doc: doc['from_id']),
        (data.StringVariable('Post ID'), lambda doc: doc['status_id']),
        (data.DiscreteVariable('Post type'), lambda doc: doc['status_type']),
        (data.TimeVariable('Publication Date'),
         lambda doc: doc['status_published']),
        (data.TimeVariable('Publication Date UTC'),
         lambda doc: doc['status_published_utc']),
        (data.ContinuousVariable('emotion angry'), lambda doc: doc['angry']),
        (data.ContinuousVariable('emotion love'), lambda doc: doc['love']),
        (data.ContinuousVariable('emotion haha'), lambda doc: doc['haha']),
        (data.ContinuousVariable('emotion wow'), lambda doc: doc['wow']),
        (data.ContinuousVariable('emotion sad'), lambda doc: doc['sad'])
    ]
    text_features = [post_metas[0][0]]
    title_indices = [-1]

    def __init__(self, credentials, on_progress=None, should_break=None):
        self.utc_datecor = datetime.utcnow() - datetime.now()
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

    def buildUrl(self, node, version='v2.11'):
        return BASE_URL + '/' + version + '/' + node

    def getData(self, url, params=None):
        while True:
            if self.should_break():
                return {}
            try:
                headers = {'Authorization': 'Bearer ' + self.credentials.token}
                p = requests.get(url, params=params, headers=headers)
                return p.json()
            except:
                print('retry in 5 sec')
                for i in range(50):
                    if self.should_break():
                        return {}
                    time.sleep(0.1)

    def localToUtc(self, date):
        return date + self.utc_datecor

    def utcToLocal(self, date):
        return date - self.utc_datecor

    def processDate(self, created_time):
        return datetime.strptime(created_time, '%Y-%m-%dT%H:%M:%S+0000')

    def processStatus(self, status, engagement=True):
        d = {}
        d['status_id'] = status['id']
        d['from_id'] = status['from']['id'] if 'from' in status else ''
        d['from_name'] = status['from']['name'] if 'from' in status else ''
        d['status_message'] = status.get('message', '')
        d['status_type'] = status['type']
        d['link_name'] = status.get('name', '')

        d['status_published_utc'] = self.processDate(status['created_time'])
        d['status_published'] = self.utcToLocal(d['status_published_utc'])
        d['status_link'] = status.get('link', '')
        d['picture'] = status.get('full_picture', '')

        topscore = 0
        d['like'] = status['like']['summary']['total_count'] if engagement else ''
        d['comments'] = status['comments']['summary']['total_count'] if engagement else ''
        d['shares'] = status['shares']['count'] if 'shares' in status else ''

        d['top_reaction'] = ''
        for score in ['love', 'haha', 'wow', 'sad', 'angry']:
            if engagement:
                d[score] = status[score]['summary']['total_count']
                if int(d[score]) > topscore:
                    topscore = int(d[score])
                    d['top_reaction'] = score
            else:
                d[score] = ''

        return d

    def fieldString(self, engagement=True):
        field_string = 'message,from,link,created_time,type,name,id,full_picture'

        if engagement:
            field_string += ',' + 'comments.limit(0).summary(true),shares.limit(0).summary(true)'
            for r in ['like', 'love', 'haha', 'wow', 'sad', 'angry']:
                field_string += ',' + 'reactions.type({}).limit(0).summary(true).as({})'.format(
                    r.upper(), r.lower())
        return field_string

    def getStatuses(self,
                    page_id,
                    mode='posts',
                    since=None,
                    until=None,
                    engagement=True,
                    comments=True):
        # mode can be "posts" (posts by page), "feed" (all posts on the page)
        # or "tagged" (all public posts in which the page is tagged)
        node = page_id + '/' + mode + '/'
        url = self.buildUrl(node)

        params = {}
        params['fields'] = self.fieldString(engagement)
        params['limit'] = 100

        if since is not None:
            params['since'] = (
                self.localToUtc(since)).strftime('%Y-%m-%dT%H:%M:%S')
        if until is not None:
            params['until'] = (
                self.localToUtc(until)).strftime('%Y-%m-%dT%H:%M:%S')
        while True:
            statuses = self.getData(url, params=params)
            if 'data' not in statuses:
                break

            proc_statuses = [
                self.processStatus(s, engagement) for s in statuses['data']
            ]
            yield proc_statuses

            if 'next' not in statuses.get('paging', {}):
                break
            url = statuses['paging']['next']

    def _search(self,
                page_ids,
                mode,
                since,
                until,
                max_documents,
                sub_progress=(0, 1)):
        since = since.strftime('%Y-%m-%d')
        until = until.strftime('%Y-%m-%d')
        since = datetime.strptime(since, '%Y-%m-%d')
        until = datetime.strptime(until + 'T23:59:59', '%Y-%m-%dT%H:%M:%S')
        total_sec = float((until - since).total_seconds())
        n_pages = len(page_ids)

        progress_pct = 1 / float(n_pages)

        for page_i in range(0, n_pages):
            page_id = page_ids[page_i].strip()
            if page_id == '': return
            if '/' in page_id: page_id = page_id.split('/')[-1]
            page_progress = progress_pct * page_i
            n = 0
            for d in self.getStatuses(page_id, mode, since, until):
                if self.should_break():
                    return
                earliest_date = d[-1]['status_published']
                sec_to_go = (until - earliest_date).total_seconds()
                date_progress = ((sec_to_go / total_sec) * progress_pct)
                progress = math.ceil((page_progress + date_progress) * 100)
                self.on_progress(progress_scale(progress, sub_progress), 100)
                for doc in d:
                    n += 1
                    if max_documents is not None:
                        if n > max_documents:
                            break
                    yield doc
                if max_documents is not None:
                    if n > max_documents:
                        break
        self.on_progress(progress_scale(100, sub_progress), 100)

    def search(self,
               page_ids,
               mode='posts',
               since=datetime.now() - timedelta(10),
               until=datetime.now(),
               max_documents=None,
               sub_progress=(0, 1)):
        results = []
        for doc in self._search(page_ids, mode, since, until, max_documents,
                                sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook', self.attributes,
                                  self.class_vars, self.post_metas,
                                  self.title_indices)
        c.set_text_features(self.text_features)
        return c

    def _search_posts(self, post_ids, sub_progress=(0, 1), engagement=True):
        for i, post_id in enumerate(post_ids):
            node = post_id
            url = self.buildUrl(node)

            params = {}
            params['fields'] = self.fieldString(engagement)
            params['limit'] = 100

            status = self.getData(url, params=params)
            status = self.processStatus(status)
            yield status

            progress = ((i + 1) / len(post_ids)) * 100
            self.on_progress(progress_scale(progress, sub_progress), 100)
        self.on_progress(progress_scale(100, sub_progress), 100)

    def search_posts(self, post_ids, sub_progress=(0, 1)):
        results = []
        for doc in self._search_posts(post_ids, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook', self.attributes,
                                  self.class_vars, self.post_metas,
                                  self.title_indices)
        c.set_text_features(self.text_features)
        return c

    def processComment(self, comment):
        has_comment_replies = 'comments' in comment.keys()
        parent = {
            'type': 'comment',
            'comment_id': comment['id'],
            'likes': comment['like']['summary']['total_count'],
            'comment_replies': None,
            'message': comment['message'],
            'parent_comment_id': ''
        }
        parent['status_published_utc'] = self.processDate(
            comment['created_time'])
        parent['status_published'] = self.utcToLocal(
            parent['status_published_utc'])
        if has_comment_replies:
            parent['comment_replies'] = comment['comments']['summary'][
                'total_count']
        yield parent

        if has_comment_replies:
            comment_replies = comment['comments']
            while True:
                for cr in comment_replies['data']:
                    child = {
                        'type': 'comment_reply',
                        'comment_id': comment['id'],
                        'likes': cr['like']['summary']['total_count'],
                        'message': cr['message'],
                        'parent_comment_id': cr['id'],
                        'comment_replies': None
                    }
                    child['status_published_utc'] = self.processDate(
                        cr['created_time'])
                    child['status_published'] = self.utcToLocal(
                        child['status_published_utc'])
                    yield child

                if 'next' not in comment_replies.get('paging', {}):
                    break
                url = comment_replies['paging']['next']
                comment_replies = self.getData(url)

    def _getComments(self,
                     post_ids,
                     comment_replies=True,
                     sub_progress=(0, 1)):
        for i, post_id in enumerate(post_ids):
            node = post_id + '/comments'
            url = self.buildUrl(node)

            params = {}
            params['fields'] = ('message,created_time,'
                                'reactions.type(LIKE).summary(true).as(like)')
            if comment_replies:
                params['fields'] += (
                    ',comments.summary(true){message,created_time,'
                    'reactions.type(LIKE).summary(true).as(like)}')
            params['limit'] = 100

            while True:
                comments = self.getData(url, params=params)
                if not comments.get('data'):
                    break

                for comment in comments['data']:
                    for proc_comment in self.processComment(comment):
                        proc_comment['post_id'] = post_id
                        yield proc_comment

                if 'next' not in comments.get('paging', {}):
                    break
                url = comments['paging']['next']
            progress = ((i + 1) / len(post_ids)) * 100
            self.on_progress(progress_scale(progress, sub_progress), 100)
        self.on_progress(progress_scale(100, sub_progress), 100)

    def getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)):
        attributes = []
        class_vars = []
        metas = [(data.StringVariable('Message'), lambda doc: doc['message']),
                 (data.DiscreteVariable('Type'), lambda doc: doc['type']),
                 (data.StringVariable('Post ID'), lambda doc: doc['post_id']),
                 (data.StringVariable('Comment ID'),
                  lambda doc: doc['comment_id']),
                 (data.StringVariable('Parent comment ID'),
                  lambda doc: doc['parent_comment_id']),
                 (data.ContinuousVariable('likes'), lambda doc: doc['likes']),
                 (data.ContinuousVariable('comment replies'),
                  lambda doc: doc['comment_replies']),
                 (data.TimeVariable('Publication Date'),
                  lambda doc: doc['status_published']),
                 (data.TimeVariable('Publication Date UTC'),
                  lambda doc: doc['status_published_utc'])]
        text_features = [metas[0][0]]
        title_indices = [-1]

        results = []
        for doc in self._getComments(post_ids, comment_replies, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook comments', attributes,
                                  class_vars, metas, title_indices)
        c.set_text_features(text_features)
        return c
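
A minimal end-to-end sketch (hypothetical) for the class above: download a week of posts from a page, then fetch the comments under them. The token and page name are placeholders; the 'Post ID' meta defined in post_metas supplies the IDs that getComments expects.

from datetime import datetime, timedelta
from types import SimpleNamespace

api = FacebookOrangeAPI(SimpleNamespace(token='YOUR_PAGE_ACCESS_TOKEN'))

posts = api.search(['someFacebookPage'], mode='posts',
                   since=datetime.now() - timedelta(days=7),
                   until=datetime.now(), max_documents=200)
print(len(posts), 'posts')

# Collect the status IDs from the 'Post ID' meta column and fetch comments.
post_ids = [str(row['Post ID']) for row in posts]
comments = api.getComments(post_ids, comment_replies=True)
print(len(comments), 'comments')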