def row_to_content_obj(key_row):
    '''Returns ``FeatureCollection`` given an HBase artifact row.

    Note that the FC returned has a Unicode feature ``artifact_id``
    set to the row's key.

    :param key_row: a ``(key, row)`` pair. ``row`` is read with
      ``.get`` for the ``response``, ``indices``, ``url`` and
      ``timestamp`` entries; ``indices`` is iterated as
      ``(attribute, value)`` pairs.
    :return: a ``(content_id, fc)`` pair. ``fc`` is ``None`` when the
      feature collection could not be built; in that case the
      traceback is written to ``sys.stderr``.
    '''
    key, row = key_row
    # NOTE(review): under Python 2 ``str.encode`` on a non-ASCII byte
    # string raises before the ``try`` below -- confirm keys arriving
    # here are unicode (or ASCII-only) strings.
    cid = mk_content_id(key.encode('utf-8'))
    response = row.get('response', {})

    # Tally every (attribute, value) index pair into one counter per
    # attribute.
    other_bows = defaultdict(StringCounter)
    for attr, val in row.get('indices', []):
        other_bows[attr][val] += 1

    try:
        artifact_id = key
        if isinstance(artifact_id, str):
            artifact_id = unicode(artifact_id, 'utf-8')
        fc = html_to_fc(
            response.get('body', ''),
            url=row.get('url'),
            timestamp=row.get('timestamp'),
            other_features=dict(other_bows, **{'artifact_id': artifact_id}))
    except Exception:
        # Best effort: a row that cannot be converted is reported and
        # yields ``None`` rather than aborting the caller.  Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
def row_to_content_obj(key_row):
    '''Returns ``FeatureCollection`` given an HBase artifact row.

    Note that the FC returned has a Unicode feature ``artifact_id``
    set to the row's key.

    :param key_row: a ``(key, row)`` pair. ``row`` is read with
      ``.get`` for the ``response``, ``indices``, ``url`` and
      ``timestamp`` entries; ``indices`` is iterated as
      ``(attribute, value)`` pairs.
    :return: a ``(content_id, fc)`` pair. ``fc`` is ``None`` when the
      feature collection could not be built; in that case the
      traceback is written to ``sys.stderr``.
    '''
    key, row = key_row
    # NOTE(review): under Python 2 ``str.encode`` on a non-ASCII byte
    # string raises before the ``try`` below -- confirm keys arriving
    # here are unicode (or ASCII-only) strings.
    cid = mk_content_id(key.encode('utf-8'))
    response = row.get('response', {})

    # Tally every (attribute, value) index pair into one counter per
    # attribute.
    other_bows = defaultdict(StringCounter)
    for attr, val in row.get('indices', []):
        other_bows[attr][val] += 1

    try:
        artifact_id = key
        if isinstance(artifact_id, str):
            artifact_id = unicode(artifact_id, 'utf-8')
        fc = html_to_fc(
            response.get('body', ''),
            url=row.get('url'),
            timestamp=row.get('timestamp'),
            other_features=dict(other_bows, **{'artifact_id': artifact_id}))
    except Exception:
        # Best effort: a row that cannot be converted is reported and
        # yields ``None`` rather than aborting the caller.  Catching
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
def forum_post_id(row):
    '''Returns the content id for a forum post row.

    The id is built by passing three fields through ``urlquote``,
    joining them with ``|`` and handing the result to
    ``mk_content_id``.  The fields are, in order: the value of
    ``forum_post_timestamp(row)``, the row's ``thread_link`` and the
    author's ``username`` (``'unknown'`` when the author has none).
    '''
    timestamp = forum_post_timestamp(row)
    thread_url = row['thread_link']
    username = row['author'].get('username', 'unknown')
    quoted = [urlquote(piece) for piece in (timestamp, thread_url, username)]
    return mk_content_id('|'.join(quoted))