class NYT:
    """ Class for fetching records from the NYT API. """

    @staticmethod
    def keywords(doc, name):
        return ', '.join([kw.get('value') for kw in doc.get('keywords', [])
                          if kw['name'] == name])

    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc.get('section_name', None)),
    ]

    tv = data.TimeVariable('Publication Date')

    metas = [
        (data.StringVariable('Headline'), lambda doc: doc.get('headline', {}).get('main') or ''),
        (data.StringVariable('Abstract'), lambda doc: doc.get('abstract') or ''),
        (data.StringVariable('Snippet'), lambda doc: doc.get('snippet') or ''),
        (data.StringVariable('Lead Paragraph'), lambda doc: doc.get('lead_paragraph') or ''),
        (data.StringVariable('Subject Keywords'), lambda doc: NYT.keywords(doc, 'subject')),
        (data.StringVariable('URL'), lambda doc: doc.get('web_url') or ''),
        (data.StringVariable('Locations'), lambda doc: NYT.keywords(doc, 'glocations')),
        (data.StringVariable('Persons'), lambda doc: NYT.keywords(doc, 'persons')),
        (data.StringVariable('Organizations'), lambda doc: NYT.keywords(doc, 'organizations')),
        (data.StringVariable('Creative Works'), lambda doc: NYT.keywords(doc, 'creative_works')),
        (tv, lambda doc: NYT.tv.parse(doc.get('pub_date'))),
        (data.DiscreteVariable('Article Type'), lambda doc: doc.get('type_of_material', None)),
        (data.ContinuousVariable('Word Count', number_of_decimals=0),
         lambda doc: doc.get('word_count', None)),
    ]

    text_features = [metas[0][0], metas[1][0]]  # headline + abstract

    def __init__(self, api_key):
        """
        Args:
            api_key (str): NY Times API key.
        """
        self.api_key = api_key
        self.on_error = None
        self.on_rate_limit = None
        self.on_no_connection = None
        self.cache_path = None
        self._cache_init()

    def api_key_valid(self):
        """ Checks whether the API key given at initialization is valid. """
        url = self._encode_url('test')
        try:
            with request.urlopen(url) as connection:
                if connection.getcode() == 200:
                    return True
        except (HTTPError, URLError, HTTPException):
            return False

    def search(self, query, date_from=None, date_to=None, max_docs=None,
               on_progress=None, should_break=None):
        """
        Args:
            query (str): Search query.
            date_from (date): Start date limit.
            date_to (date): End date limit.
            max_docs (int): Maximal number of documents returned.
            on_progress (callback): Called after every iteration of downloading.
            should_break (callback): Callback for breaking the computation
                before the end. If it evaluates to True, downloading is stopped
                and the documents downloaded so far are returned in a Corpus.

        Returns:
            Corpus: Search results.
        """
        if max_docs is None or max_docs > MAX_DOCS:
            max_docs = MAX_DOCS

        # TODO create corpus on the fly and extend, so it stops faster.
        records = []
        data, go_sleep = self._fetch_page(query, date_from, date_to, 0)
        if data is None:
            return None
        records.extend(data['response']['docs'])
        max_docs = min(data['response']['meta']['hits'], max_docs)
        if callable(on_progress):
            on_progress(len(records), max_docs)

        for page in range(1, math.ceil(max_docs / BATCH_SIZE)):
            if callable(should_break) and should_break():
                break
            if go_sleep:
                sleep(SLEEP)

            data, go_sleep = self._fetch_page(query, date_from, date_to, page)

            if data is None:
                break

            records.extend(data['response']['docs'])
            if callable(on_progress):
                on_progress(len(records), max_docs)

        if len(records) > max_docs:
            records = records[:max_docs]

        return Corpus.from_documents(records, 'NY Times', self.attributes,
                                     self.class_vars, self.metas,
                                     title_indices=[-1])

    def _cache_init(self):
        """ Initialize cache in the Orange environment buffer dir. """
        path = os.path.join(environ.cache_dir(), "nytcache")
        try:
            if not os.path.exists(path):
                os.makedirs(path)
            self.cache_path = os.path.join(path, "query_cache")
        except OSError as e:
            warnings.warn('Could not initialize NYT cache: {}'.format(str(e)),
                          RuntimeWarning)

    def _cache_fetch(self, url):
        """ Fetch URL from cache if present. """
        with shelve.open(self.cache_path) as cache:
            if url in cache.keys():
                return cache[url]
            else:
                return None

    def _cache_store(self, url, data):
        """ Store data for URL in cache. """
        with shelve.open(self.cache_path) as cache:
            cache[url] = data

    def _fetch_page(self, query, date_from, date_to, page):
        """ Fetch one page either from cache or web. """
        cache_url = self._encode_url(query, date_from, date_to, page,
                                     for_caching=True)
        data = self._cache_fetch(cache_url)
        if data:
            return data, False
        else:
            url = self._encode_url(query, date_from, date_to, page,
                                   for_caching=False)
            try:
                with request.urlopen(url, timeout=TIMEOUT) as conn:
                    data = conn.read().decode('utf-8')
            except HTTPError as e:
                if e.code == 403 and page > 0:
                    # Occasionally some pages return error 403 (Forbidden)
                    # while all other page numbers seem to work just fine.
                    # Skip such pages and don't break loading!
                    warnings.warn('NYT api returned HTTPError with code 403 '
                                  '(Forbidden)! Skipping this page ...')
                    return {'response': {'docs': []}}, True
                if e.code == 429 and callable(self.on_rate_limit):
                    self.on_rate_limit()
                elif callable(self.on_error):
                    self.on_error(str(e))
                return None, False
            except URLError:
                if callable(self.on_no_connection):
                    self.on_no_connection()
                    return None, False
                raise
            data = json.loads(data)
            self._cache_store(cache_url, data)
            return data, True

    def _encode_url(self, query, date_from=None, date_to=None, page=0,
                    for_caching=False):
        """ Encode url for given query, date restrictions and page number.

        Args:
            query (str): Search query.
            date_from (date): Date restriction.
            date_to (date): Date restriction.
            page (int): Page number.
            for_caching (bool): Whether URL would be used for caching.
                If set, exclude BASE_URL and API key.

        Returns:
            str: An encoded URL.
        """
        params = [  # list required to preserve order - important for caching
            ('api-key', self.api_key),
            ('fq', 'The New York Times'),
            ('q', query),
            ('page', page),
        ]
        if date_from:
            params.append(('begin_date', date_from.strftime('%Y%m%d')))
        if date_to:
            params.append(('end_date', date_to.strftime('%Y%m%d')))
        if for_caching:  # remove api key, return only params
            del params[0]
            return parse.urlencode(params)
        else:
            return '{}?{}'.format(BASE_URL, parse.urlencode(params))


class TheGuardianAPI:
    attributes = []

    class_vars = [
        (data.DiscreteVariable('Section'), lambda doc: doc['sectionName']),
    ]

    tv = data.TimeVariable('Publication Date')

    metas = [
        (data.StringVariable('Headline'), lambda doc: doc['fields']['headline']),
        (data.StringVariable('Content'), lambda doc: doc['fields']['bodyText']),
        (data.StringVariable('Trail Text'), lambda doc: doc['fields']['trailText']),
        (data.StringVariable('HTML'), lambda doc: doc['fields']['body']),
        (tv, lambda doc: TheGuardianAPI.tv.parse(doc['webPublicationDate'])),
        (data.DiscreteVariable('Type'), lambda doc: doc['type']),
        (data.DiscreteVariable('Language'), lambda doc: doc['fields']['lang']),
        (data.StringVariable('Tags'),
         lambda doc: ', '.join(tag['webTitle'] for tag in doc['tags'])),
        (data.StringVariable('URL'), lambda doc: doc['webUrl']),
        (data.ContinuousVariable('Word Count', number_of_decimals=0),
         lambda doc: doc['fields']['wordcount']),
    ]

    text_features = [metas[0][0], metas[1][0]]  # Headline + Content
    title_indices = [-1]  # Headline

    def __init__(self, credentials, on_progress=None, should_break=None):
        """
        Args:
            credentials (:class:`TheGuardianCredentials`): The Guardian credentials.
            on_progress (callable): Function for progress reporting.
            should_break (callable): Function for early stopping.
        """
        self.per_page = ARTICLES_PER_PAGE
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

        self.results = []

    def _search(self, query, from_date, to_date, page=1):
        data = self._build_query(query, from_date, to_date, page)

        response = requests.get(BASE_URL, data)
        parsed = json.loads(response.text)

        if page == 1:  # store number of pages
            self.pages = parsed['response']['pages']

        self.results.extend(parsed['response']['results'])

    def _build_query(self, query, from_date=None, to_date=None, page=1):
        data = {
            'q': query,
            'api-key': self.credentials.key,
            'page': str(page),
            'show-fields': 'headline,trailText,body,bodyText,lang,wordcount',
            'show-tags': 'all',
        }
        if from_date is not None:
            data['from-date'] = from_date
        if to_date is not None:
            data['to-date'] = to_date

        return data

    def search(self, query, from_date=None, to_date=None, max_documents=None,
               accumulate=False):
        """
        Search The Guardian API for articles.

        Args:
            query (str): A query for searching the articles by.
            from_date (str): Search only articles newer than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            to_date (str): Search only articles older than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            max_documents (int): Maximum number of documents to retrieve.
                When not given, retrieve all documents.
            accumulate (bool): A flag indicating whether to accumulate results
                of multiple consecutive search calls.

        Returns:
            :ref:`Corpus`
        """
        if not accumulate:
            self.results = []

        self._search(query, from_date, to_date)

        pages = math.ceil(max_documents / self.per_page) if max_documents else self.pages
        self.on_progress(self.per_page, pages * self.per_page)

        for p in range(2, pages + 1):  # to one based
            if self.should_break():
                break
            self._search(query, from_date, to_date, p)
            self.on_progress(p * self.per_page, pages * self.per_page)

        c = Corpus.from_documents(
            self.results, 'The Guardian', self.attributes, self.class_vars,
            self.metas, title_indices=self.title_indices)
        c.text_features = self.text_features
        return c


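# Illustrative usage sketch only (not part of the original module): fetching
# a Corpus with TheGuardianAPI above. `TheGuardianCredentials` is assumed to
# be defined elsewhere in the module and to expose a `.key` attribute.
def _example_guardian_usage(api_key='YOUR-GUARDIAN-API-KEY'):
    credentials = TheGuardianCredentials(api_key)
    api = TheGuardianAPI(credentials,
                         on_progress=lambda done, total: print(done, '/', total))
    # Retrieve at most 100 documents published in 2016.
    corpus = api.search('slovenia', from_date='2016-01-01',
                        to_date='2016-12-31', max_documents=100)
    return corpus

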
class TwitterAPI:
    """ Fetch tweets from the Twitter API.

    Notes:
        Results across multiple searches are aggregated. To remove tweets from
        previous searches and only return results from the last search, either
        call the `reset` method before searching or provide the
        `collecting=False` argument to the search method.
    """

    attributes = []
    class_vars = [
        (data.DiscreteVariable('Author'), lambda doc: '@' + doc.author.screen_name),
    ]

    tv = data.TimeVariable('Date')
    metas = [
        (data.StringVariable('Content'), lambda doc: doc.text),
        (tv, lambda doc: TwitterAPI.tv.parse(doc.created_at.isoformat())),
        (data.DiscreteVariable('Language'), lambda doc: doc.lang),
        (data.DiscreteVariable('Location'),
         lambda doc: getattr(doc.place, 'country_code', None)),
        (data.ContinuousVariable('Number of Likes', number_of_decimals=0),
         lambda doc: doc.favorite_count),
        (data.ContinuousVariable('Number of Retweets', number_of_decimals=0),
         lambda doc: doc.retweet_count),
        (data.DiscreteVariable('In Reply To'),
         lambda doc: '@' + doc.in_reply_to_screen_name
         if doc.in_reply_to_screen_name else ''),
        (data.DiscreteVariable('Author Name'), lambda doc: doc.author.name),
        (data.StringVariable('Author Description'),
         lambda doc: doc.author.description),
        (data.ContinuousVariable('Author Statuses Count', number_of_decimals=0),
         lambda doc: doc.author.statuses_count),
        (data.ContinuousVariable('Author Favourites Count', number_of_decimals=0),
         lambda doc: doc.author.favourites_count),
        (data.ContinuousVariable('Author Friends Count', number_of_decimals=0),
         lambda doc: doc.author.friends_count),
        (data.ContinuousVariable('Author Followers Count', number_of_decimals=0),
         lambda doc: doc.author.followers_count),
        (data.ContinuousVariable('Author Listed Count', number_of_decimals=0),
         lambda doc: doc.author.listed_count),
        (data.DiscreteVariable('Author Verified'), lambda doc: str(doc.author.verified)),
        (data.ContinuousVariable('Longitude'),
         lambda doc: coordinates_geoJSON(doc.coordinates)[0]),
        (data.ContinuousVariable('Latitude'),
         lambda doc: coordinates_geoJSON(doc.coordinates)[1]),
    ]

    text_features = [metas[0][0]]  # Content
    string_attributes = [m for m, _ in metas
                         if isinstance(m, data.StringVariable)]

    def __init__(self, credentials, on_progress=None, should_break=None,
                 on_error=None, on_rate_limit=None):
        self.key = credentials
        self.api = tweepy.API(credentials.auth)
        self.container = OrderedDict()
        self.search_history = []

        # Callbacks:
        self.on_error = on_error
        self.on_rate_limit = on_rate_limit
        self.on_progress = on_progress or (lambda *args: args)
        self.should_break = should_break or (lambda *args: False)

    @property
    def tweets(self):
        return self.container.values()

    def search_content(self, content, *, max_tweets=0, lang=None,
                       allow_retweets=True, collecting=False):
        """ Search by content.

        Args:
            content (list of str): A list of key words to search for.
            max_tweets (int): If greater than zero, limits the number of
                downloaded tweets.
            lang (str): A language's code (either ISO 639-1 or ISO 639-3 formats).
            allow_retweets (bool): Whether to download retweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:
            max_tweets = float('Inf')

        def build_query():
            nonlocal content
            if not content:
                q = 'from: '
            else:
                if not isinstance(content, list):
                    content = [content]
                q = ' OR '.join(['"{}"'.format(q) for q in content])
            if not allow_retweets:
                q += ' -filter:retweets'
            return q

        query = build_query()
        cursor = tweepy.Cursor(self.api.search, q=query, lang=lang)

        corpus, count = self.fetch(cursor, max_tweets)
        self.append_history('Content', content, lang if lang else 'Any',
                            str(allow_retweets), count)
        return corpus

    def search_authors(self, authors, *, max_tweets=0, collecting=False):
        """ Search by authors.

        Args:
            authors (list of str): A list of authors to search for.
            max_tweets (int): If greater than zero, limits the number of
                downloaded tweets.
            collecting (bool): Whether to collect results across multiple
                search calls.

        Returns:
            Corpus
        """
        if not collecting:
            self.reset()

        if max_tweets == 0:  # set to max allowed for progress
            max_tweets = 3200

        if not isinstance(authors, list):
            authors = [authors]

        cursors = [tweepy.Cursor(self.api.user_timeline, screen_name=a)
                   for a in authors]

        corpus, count = self.fetch(cursors, max_tweets)
        self.append_history('Author', authors, None, None, count)
        return corpus

    def fetch(self, cursors, max_tweets):
        if not isinstance(cursors, list):
            cursors = [cursors]

        count = 0
        try:
            for i, cursor in enumerate(cursors):
                for j, tweet in enumerate(cursor.items(max_tweets), start=1):
                    if self.should_break():
                        break
                    if tweet.id not in self.container:
                        count += 1
                        self.container[tweet.id] = tweet
                    if j % 20 == 0:
                        self.on_progress(len(self.container),
                                         (i * max_tweets + j) /
                                         (len(cursors) * max_tweets))
                if self.should_break():
                    break
        except tweepy.TweepError as e:
            if e.response.status_code == 429 and self.on_rate_limit:
                self.on_rate_limit()
            elif self.on_error:
                self.on_error(str(e))
            return None, 0

        return self.create_corpus(), count

    def create_corpus(self):
        return Corpus.from_documents(self.tweets, 'Twitter', self.attributes,
                                     self.class_vars, self.metas,
                                     title_indices=[-1])

    def reset(self):
        """ Removes all downloaded tweets. """
        self.search_history = []
        self.container = OrderedDict()

    def append_history(self, mode, query, lang, allow_retweets, n_tweets):
        query = ', '.join(query) if isinstance(query, Iterable) else query
        if lang in code2lang.keys():
            lang = code2lang[lang]
        self.search_history.append((
            ('Query', query),
            ('Search by', mode),
            ('Language', lang),
            ('Allow retweets', allow_retweets),
            ('Tweets count', n_tweets),
        ))

    def report(self):
        return self.search_history


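# Illustrative usage sketch only (not part of the original module): the
# `credentials` object is assumed to wrap tweepy authentication and expose an
# `.auth` attribute, as required by TwitterAPI.__init__ above; the query and
# screen name below are placeholders.
def _example_twitter_usage(credentials):
    api = TwitterAPI(credentials)
    # Search by content; results are reset first because collecting=False.
    corpus = api.search_content(['orange data mining'], max_tweets=100,
                                lang='en', allow_retweets=False)
    # Follow up with an author search, keeping the previously fetched tweets.
    corpus = api.search_authors(['example_screen_name'], max_tweets=100,
                                collecting=True)
    return corpus, api.report()

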
class FacebookOrangeAPI:
    attributes = []
    class_vars = []

    image_var = data.StringVariable.make("image")
    image_var.attributes["type"] = "image"

    post_metas = [
        (data.StringVariable('Message'), lambda doc: doc['status_message']),
        (data.DiscreteVariable('From'), lambda doc: doc['from_name']),
        (data.ContinuousVariable('likes'), lambda doc: doc['like']),
        (data.ContinuousVariable('comments'), lambda doc: doc['comments']),
        (data.ContinuousVariable('shares'), lambda doc: doc['shares']),
        (data.DiscreteVariable('top emotion'), lambda doc: doc['top_reaction']),
        (data.StringVariable('Link name'), lambda doc: doc['link_name']),
        (image_var, lambda doc: doc['picture']),
        (data.StringVariable('link'), lambda doc: doc['status_link']),
        (data.DiscreteVariable('From ID'), lambda doc: doc['from_id']),
        (data.StringVariable('Post ID'), lambda doc: doc['status_id']),
        (data.DiscreteVariable('Post type'), lambda doc: doc['status_type']),
        (data.TimeVariable('Publication Date'), lambda doc: doc['status_published']),
        (data.TimeVariable('Publication Date UTC'), lambda doc: doc['status_published_utc']),
        (data.ContinuousVariable('emotion angry'), lambda doc: doc['angry']),
        (data.ContinuousVariable('emotion love'), lambda doc: doc['love']),
        (data.ContinuousVariable('emotion haha'), lambda doc: doc['haha']),
        (data.ContinuousVariable('emotion wow'), lambda doc: doc['wow']),
        (data.ContinuousVariable('emotion sad'), lambda doc: doc['sad'])
    ]
    text_features = [post_metas[0][0]]
    title_indices = [-1]

    def __init__(self, credentials, on_progress=None, should_break=None):
        self.utc_datecor = datetime.utcnow() - datetime.now()
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

    def buildUrl(self, node, version='v2.11'):
        return BASE_URL + '/' + version + '/' + node

    def getData(self, url, params=None):
        while True:
            if self.should_break():
                return {}
            try:
                headers = {'Authorization': 'Bearer ' + self.credentials.token}
                p = requests.get(url, params=params, headers=headers)
                return p.json()
            except Exception:
                # Request or JSON decoding failed: wait 5 seconds and retry,
                # checking for an interrupt every 0.1 seconds.
                print('retry in 5 sec')
                for i in range(50):
                    if self.should_break():
                        return {}
                    time.sleep(0.1)

    def localToUtc(self, date):
        return date + self.utc_datecor

    def utcToLocal(self, date):
        return date - self.utc_datecor

    def processDate(self, created_time):
        return datetime.strptime(created_time, '%Y-%m-%dT%H:%M:%S+0000')

    def processStatus(self, status, engagement=True):
        d = {}
        d['status_id'] = status['id']
        d['from_id'] = status['from']['id'] if 'from' in status.keys() else ''
        d['from_name'] = status['from']['name'] if 'from' in status.keys() else ''
        d['status_message'] = '' if 'message' not in status.keys() else status['message']
        d['status_type'] = status['type']
        d['link_name'] = '' if 'name' not in status.keys() else status['name']
        d['status_published_utc'] = self.processDate(status['created_time'])
        d['status_published'] = self.utcToLocal(d['status_published_utc'])
        d['status_link'] = '' if 'link' not in status.keys() else status['link']
        d['picture'] = status['full_picture'] if 'full_picture' in status.keys() else ''

        topscore = 0
        d['like'] = status['like']['summary']['total_count'] if engagement else ''
        d['comments'] = status['comments']['summary']['total_count'] if engagement else ''
        d['shares'] = status['shares']['count'] if 'shares' in status.keys() else ''
        d['top_reaction'] = ''
        for score in ['love', 'haha', 'wow', 'sad', 'angry']:
            if engagement:
                d[score] = status[score]['summary']['total_count']
                if int(d[score]) > topscore:
                    topscore = int(d[score])
                    d['top_reaction'] = score
            else:
                d[score] = ''
        return d

    def fieldString(self, engagement=True):
        field_string = 'message,from,link,created_time,type,name,id,full_picture'
        if engagement:
            field_string += ',' + 'comments.limit(0).summary(true),shares.limit(0).summary(true)'
            for r in ['like', 'love', 'haha', 'wow', 'sad', 'angry']:
                field_string += ',' + 'reactions.type({}).limit(0).summary(true).as({})'.format(
                    r.upper(), r.lower())
        return field_string

    def getStatuses(self, page_id, mode='posts', since=None, until=None,
                    engagement=True, comments=True):
        # mode can be "posts" (posts by page), "feed" (all posts on page) and
        # "tagged" (all public posts in which the page is tagged)
        node = page_id + '/' + mode + '/'
        url = self.buildUrl(node)
        params = {}
        params['fields'] = self.fieldString(engagement)
        params['limit'] = 100
        if since is not None:
            params['since'] = (self.localToUtc(since)).strftime('%Y-%m-%dT%H:%M:%S')
        if until is not None:
            params['until'] = (self.localToUtc(until)).strftime('%Y-%m-%dT%H:%M:%S')

        while True:
            statuses = self.getData(url, params=params)
            if 'data' not in statuses:
                break
            proc_statuses = [self.processStatus(s, engagement)
                             for s in statuses['data']]
            yield proc_statuses
            if 'paging' not in statuses.keys():
                break
            if 'next' not in statuses['paging'].keys():
                break
            url = statuses['paging']['next']

    def _search(self, page_ids, mode, since, until, max_documents,
                sub_progress=(0, 1)):
        # Normalize the date range to whole days: from midnight of `since`
        # to the last second of `until`.
        since = since.strftime('%Y-%m-%d')
        until = until.strftime('%Y-%m-%d')
        since = datetime.strptime(since, '%Y-%m-%d')
        until = datetime.strptime(until + 'T23:59:59', '%Y-%m-%dT%H:%M:%S')
        total_sec = float((until - since).total_seconds())

        n_pages = len(page_ids)
        progress_pct = 1 / float(n_pages)
        for page_i in range(0, n_pages):
            page_id = page_ids[page_i].strip()
            if page_id == '':
                return
            if '/' in page_id:
                page_id = page_id.split('/')[-1]
            page_progress = progress_pct * page_i

            n = 0
            for d in self.getStatuses(page_id, mode, since, until):
                if self.should_break():
                    return
                earliest_date = d[-1]['status_published']
                sec_to_go = (until - earliest_date).total_seconds()
                date_progress = ((sec_to_go / total_sec) * progress_pct)
                progress = math.ceil((page_progress + date_progress) * 100)
                self.on_progress(progress_scale(progress, sub_progress), 100)

                for doc in d:
                    n += 1
                    if max_documents is not None:
                        if n > max_documents:
                            break
                    yield doc
                if max_documents is not None:
                    if n > max_documents:
                        break
        self.on_progress(progress_scale(100, sub_progress), 100)

    def search(self, page_ids, mode='posts',
               since=datetime.now() - timedelta(10), until=datetime.now(),
               max_documents=None, sub_progress=(0, 1)):
        results = []
        for doc in self._search(page_ids, mode, since, until, max_documents,
                                sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook', self.attributes,
                                  self.class_vars, self.post_metas,
                                  self.title_indices)
        c.set_text_features(self.text_features)
        return c

    def _search_posts(self, post_ids, sub_progress=(0, 1), engagement=True):
        for i, post_id in enumerate(post_ids):
            node = post_id
            url = self.buildUrl(node)
            params = {}
            params['fields'] = self.fieldString(engagement)
            params['limit'] = 100
            status = self.getData(url, params=params)
            status = self.processStatus(status)
            yield status
            progress = ((i + 1) / len(post_ids)) * 100
            self.on_progress(progress_scale(progress, sub_progress), 100)
        self.on_progress(progress_scale(100, sub_progress), 100)

    def search_posts(self, post_ids, sub_progress=(0, 1)):
        results = []
        for doc in self._search_posts(post_ids, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook', self.attributes,
                                  self.class_vars, self.post_metas,
                                  self.title_indices)
        c.set_text_features(self.text_features)
        return c

    def processComment(self, comment):
        has_comment_replies = 'comments' in comment.keys()

        parent = {
            'type': 'comment',
            'comment_id': comment['id'],
            'likes': comment['like']['summary']['total_count'],
            'comment_replies': None,
            'message': comment['message'],
            'parent_comment_id': ''
        }
        parent['status_published_utc'] = self.processDate(comment['created_time'])
        parent['status_published'] = self.utcToLocal(parent['status_published_utc'])
        if has_comment_replies:
            parent['comment_replies'] = comment['comments']['summary']['total_count']
        yield parent

        if has_comment_replies:
            comment_replies = comment['comments']
            while True:
                for cr in comment_replies['data']:
                    child = {
                        'type': 'comment_reply',
                        'comment_id': comment['id'],
                        'likes': cr['like']['summary']['total_count'],
                        'message': cr['message'],
                        'parent_comment_id': cr['id'],
                        'comment_replies': None
                    }
                    child['status_published_utc'] = self.processDate(cr['created_time'])
                    child['status_published'] = self.utcToLocal(
                        child['status_published_utc'])
                    yield child
                if 'paging' not in comment_replies.keys():
                    break
                if 'next' not in comment_replies['paging'].keys():
                    break
                url = comment_replies['paging']['next']
                comment_replies = self.getData(url)

    def _getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)):
        for i, post_id in enumerate(post_ids):
            node = post_id + '/comments'
            url = self.buildUrl(node)
            params = {}
            params['fields'] = 'message,created_time,reactions.type(LIKE).summary(true).as(like)'
            if comment_replies:
                params['fields'] += ',comments.summary(true){message,created_time,reactions.type(LIKE).summary(true).as(like)}'
            params['limit'] = 100

            while True:
                comments = self.getData(url, params=params)
                if 'data' not in comments or len(comments['data']) == 0:
                    break
                for comment in comments['data']:
                    for proc_comment in self.processComment(comment):
                        proc_comment['post_id'] = post_id
                        yield proc_comment
                if 'paging' not in comments.keys():
                    break
                if 'next' not in comments['paging'].keys():
                    break
                url = comments['paging']['next']
            progress = ((i + 1) / len(post_ids)) * 100
            self.on_progress(progress_scale(progress, sub_progress), 100)
        self.on_progress(progress_scale(100, sub_progress), 100)

    def getComments(self, post_ids, comment_replies=True, sub_progress=(0, 1)):
        attributes = []
        class_vars = []
        metas = [
            (data.StringVariable('Message'), lambda doc: doc['message']),
            (data.DiscreteVariable('Type'), lambda doc: doc['type']),
            (data.StringVariable('Post ID'), lambda doc: doc['post_id']),
            (data.StringVariable('Comment ID'), lambda doc: doc['comment_id']),
            (data.StringVariable('Parent comment ID'), lambda doc: doc['parent_comment_id']),
            (data.ContinuousVariable('likes'), lambda doc: doc['likes']),
            (data.ContinuousVariable('comment replies'), lambda doc: doc['comment_replies']),
            (data.TimeVariable('Publication Date'), lambda doc: doc['status_published']),
            (data.TimeVariable('Publication Date UTC'), lambda doc: doc['status_published_utc'])
        ]
        text_features = [metas[0][0]]
        title_indices = [-1]

        results = []
        for doc in self._getComments(post_ids, comment_replies, sub_progress):
            doc['status_published'] = doc['status_published'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            doc['status_published_utc'] = doc['status_published_utc'].strftime(
                '%Y-%m-%dT%H:%M:%S')
            results.append(doc)

        c = Corpus.from_documents(results, 'Facebook comments', attributes,
                                  class_vars, metas, title_indices)
        c.set_text_features(text_features)
        return c


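# Illustrative usage sketch only (not part of the original module): the
# `credentials` object is assumed to expose a `.token` attribute (a Facebook
# Graph API access token), as used by FacebookOrangeAPI.getData above; the
# page and post ids below are placeholders.
def _example_facebook_usage(credentials):
    api = FacebookOrangeAPI(credentials)
    # Posts published by a page in the last ten days (the default range).
    posts = api.search(['some.page.id'], mode='posts', max_documents=200)
    # Comments (and comment replies) for a list of post ids.
    comments = api.getComments(['123_456'], comment_replies=True)
    return posts, comments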