Ejemplo n.º 1
0
def row_to_content_obj(key_row):
    '''Build a ``(content_id, feature_collection)`` pair from an HBase row.

    :param key_row: a ``(key, row)`` pair. ``row`` is read with ``.get``
      for the keys ``response`` (whose ``body`` is the HTML handed to
      ``html_to_fc``), ``url``, ``timestamp`` and ``indices`` (an
      iterable of ``(attr, val)`` pairs).
    :returns: ``(cid, fc)`` where ``cid`` is ``mk_content_id`` applied to
      the UTF-8 encoded key and ``fc`` is whatever ``html_to_fc``
      returns, or ``None`` when building it raised an exception.

    Note that the FC returned has a Unicode feature ``artifact_id``
    set to the row's key.
    '''
    key, row = key_row
    cid = mk_content_id(key.encode('utf-8'))
    response = row.get('response', {})

    # Tally every (attr, val) index pair into a per-attribute counter.
    other_bows = defaultdict(StringCounter)
    for attr, val in row.get('indices', []):
        other_bows[attr][val] += 1
    try:
        artifact_id = key
        if isinstance(artifact_id, str):
            artifact_id = unicode(artifact_id, 'utf-8')
        # ``artifact_id`` is set last so it wins over any index
        # attribute that happens to share the name.
        other_features = dict(other_bows)
        other_features['artifact_id'] = artifact_id
        fc = html_to_fc(response.get('body', ''),
                        url=row.get('url'),
                        timestamp=row.get('timestamp'),
                        other_features=other_features)
    except Exception:
        # Best effort: one bad row is reported and skipped. Catching
        # ``Exception`` rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
Ejemplo n.º 2
0
def row_to_content_obj(key_row):
    '''Turn an HBase artifact row into ``(content_id, FeatureCollection)``.

    :param key_row: two-element sequence ``(key, row)``. ``row`` is
      queried with ``.get`` for ``response`` (its ``body`` is passed to
      ``html_to_fc``), ``url``, ``timestamp`` and ``indices`` (iterated
      as ``(attr, val)`` pairs).
    :returns: ``(cid, fc)``. ``cid`` comes from ``mk_content_id`` on the
      UTF-8 encoded key. ``fc`` is the value returned by ``html_to_fc``,
      or ``None`` if creating it raised an exception (the traceback is
      written to stderr in that case).

    Note that the FC returned has a Unicode feature ``artifact_id``
    set to the row's key.
    '''
    key, row = key_row
    cid = mk_content_id(key.encode('utf-8'))
    response = row.get('response', {})

    # Count occurrences of each value, grouped by index attribute.
    other_bows = defaultdict(StringCounter)
    for attr, val in row.get('indices', []):
        other_bows[attr][val] += 1
    try:
        artifact_id = key
        if isinstance(artifact_id, str):
            artifact_id = unicode(artifact_id, 'utf-8')
        fc = html_to_fc(
            response.get('body', ''),
            url=row.get('url'), timestamp=row.get('timestamp'),
            other_features=dict(other_bows, **{'artifact_id': artifact_id}))
    except Exception:
        # Deliberately broad so a single malformed row does not abort the
        # caller, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still be able to stop the process.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
Ejemplo n.º 3
0
def forum_post_id(row):
    '''Return the content id for a forum post row.

    The id is ``mk_content_id`` applied to the URL-quoted timestamp
    (from ``forum_post_timestamp``), thread link and author username,
    joined with ``|``. A missing username falls back to ``'unknown'``.
    '''
    parts = [
        forum_post_timestamp(row),
        row['thread_link'],
        row['author'].get('username', 'unknown'),
    ]
    quoted = [urlquote(part) for part in parts]
    return mk_content_id('|'.join(quoted))
Ejemplo n.º 4
0
def forum_post_id(row):
    '''Derive a content id from a forum post's timestamp, link and author.

    Each of the three components is passed through ``urlquote`` and the
    results are joined with ``|`` before being handed to
    ``mk_content_id``. The author defaults to ``'unknown'`` when the
    ``author`` mapping has no ``username`` entry.
    '''
    timestamp = forum_post_timestamp(row)
    thread_url = row['thread_link']
    username = row['author'].get('username', 'unknown')
    key_fields = (timestamp, thread_url, username)
    raw_id = '|'.join([urlquote(field) for field in key_fields])
    return mk_content_id(raw_id)