Beispiel #1
0
def row_to_content_obj(key_row):
    '''Returns ``(cid, FeatureCollection)`` given an HBase artifact row.

    :param key_row: a ``(key, row)`` pair. ``row`` is a mapping that is
      read for ``response`` (whose ``body`` is the HTML), ``url``,
      ``timestamp`` and ``indices`` (an iterable of ``(attr, val)``
      pairs).
    :returns: ``(cid, fc)`` where ``cid`` is ``mk_content_id`` applied
      to the UTF-8 encoded key and ``fc`` is whatever ``html_to_fc``
      returns, or ``None`` if building it raised an exception (the
      traceback is written to stderr).

    Note that the FC is built with an extra feature ``artifact_id``
    holding the row's key as Unicode text.
    '''
    key, row = key_row
    # A byte-string key is already encoded.  Calling ``.encode`` on it
    # would raise for non-ASCII keys on Python 2 (implicit ASCII decode)
    # and for any ``bytes`` key on Python 3.
    key_bytes = key if isinstance(key, bytes) else key.encode('utf-8')
    cid = mk_content_id(key_bytes)
    response = row.get('response', {})

    # Count how often each (attr, val) index pair occurs, grouped by attr.
    other_bows = defaultdict(StringCounter)
    for attr, val in row.get('indices', []):
        other_bows[attr][val] += 1

    try:
        artifact_id = key
        if isinstance(artifact_id, bytes):
            # ``bytes`` is ``str`` on Python 2, so this matches the
            # old ``unicode(key, 'utf-8')`` conversion there.
            artifact_id = artifact_id.decode('utf-8')
        other_features = dict(other_bows)
        other_features['artifact_id'] = artifact_id
        fc = html_to_fc(response.get('body', ''),
                        url=row.get('url'),
                        timestamp=row.get('timestamp'),
                        other_features=other_features)
    except Exception:
        # Best effort: report the failure and carry on with ``fc=None``.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
Beispiel #2
0
def row_to_content_obj(key_row):
    '''Returns ``(cid, FeatureCollection)`` given an HBase artifact row.

    :param key_row: a ``(key, row)`` pair. ``row`` is a mapping that is
      read for ``response`` (whose ``body`` is the HTML), ``url``,
      ``timestamp`` and ``indices`` (an iterable of ``(attr, val)``
      pairs).
    :returns: ``(cid, fc)`` where ``cid`` is ``mk_content_id`` applied
      to the UTF-8 encoded key and ``fc`` is whatever ``html_to_fc``
      returns, or ``None`` if building it raised an exception (the
      traceback is written to stderr).

    Note that the FC is built with an extra feature ``artifact_id``
    holding the row's key as Unicode text.
    '''
    key, row = key_row
    # Only text keys need encoding; a byte-string key is passed through
    # unchanged.  ``.encode`` on a byte string raises for non-ASCII
    # data on Python 2 and does not exist on Python 3 ``bytes``.
    if isinstance(key, bytes):
        encoded_key = key
    else:
        encoded_key = key.encode('utf-8')
    cid = mk_content_id(encoded_key)
    response = row.get('response', {})

    # Count how often each (attr, val) index pair occurs, grouped by attr.
    other_bows = defaultdict(StringCounter)
    for attr, val in row.get('indices', []):
        other_bows[attr][val] += 1

    try:
        # ``bytes`` is ``str`` on Python 2, so the decode below matches
        # the old ``unicode(key, 'utf-8')`` conversion there.
        if isinstance(key, bytes):
            artifact_id = key.decode('utf-8')
        else:
            artifact_id = key
        fc = html_to_fc(
            response.get('body', ''),
            url=row.get('url'), timestamp=row.get('timestamp'),
            other_features=dict(other_bows, artifact_id=artifact_id))
    except Exception:
        # Best effort: report the failure and carry on with ``fc=None``.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
Beispiel #3
0
def feature_pipeline(chunk_in, FC_chunk_out):
    '''Run a basic pipeline to generate feature collections from
    streamitems. If the output file already exists, it is loaded
    instead of being regenerated. Returns a list of either the
    generated FCs or the FCs read from the existing file.

    `chunk_in` path to a directory that is searched for ``*.sc.xz``
    SC chunk files

    `FC_chunk_out` path where the FC chunk file will be written

    Progress messages are printed to stdout. Each message is built as
    a single string so the ``print`` calls produce the same output
    under Python 2 and Python 3.
    '''
    import sys  # local: only needed for the newline-free progress message

    if isfile(FC_chunk_out):
        # No trailing newline, so 'loaded.' finishes the same line.
        sys.stdout.write('%s already exists... ' % (FC_chunk_out,))
        # Kept as a comprehension (not ``list(...)``) so that only
        # iteration is required of the chunk object.
        fcs = [fc for fc in FC_Chunk(FC_chunk_out, mode='rb')]
        print('loaded.')
        return fcs

    chunk_out = FC_Chunk(FC_chunk_out, mode='wb')
    fcs = []
    try:
        for cfile in glob.glob(join(chunk_in, '*.sc.xz')):
            print('processing %s' % (cfile,))
            for i, si in enumerate(SC_Chunk(cfile)):
                if i % 10 == 0:
                    print('%s fc processed' % (i,))
                fc = html_to_fc(
                    html=si.body.raw,
                    encoding=si.body.encoding,
                    url=si.abs_url)
                chunk_out.add(fc)
                fcs.append(fc)
    finally:
        # The output chunk was previously never closed.
        # NOTE(review): assumes FC_Chunk exposes ``close()`` the way
        # streamcorpus chunk objects do -- confirm.
        chunk_out.close()

    print('done creating %s' % (FC_chunk_out,))
    return fcs
Beispiel #4
0
def from_forum_post(row):
    '''Returns ``(cid, FeatureCollection)`` given a forum post row.

    :param row: a mapping with at least ``content`` (the post's HTML)
      and ``thread_link`` (used as the URL). It is also passed to
      ``forum_post_id``, ``forum_post_timestamp`` and
      ``forum_post_features``.
    :returns: ``(cid, fc)`` where ``cid`` comes from ``forum_post_id``
      and ``fc`` is whatever ``html_to_fc`` returns, or ``None`` if
      building it raised an exception (the traceback is written to
      stderr).
    '''
    cid = forum_post_id(row)
    try:
        fc = html_to_fc(row['content'].strip(),
                        url=row['thread_link'],
                        timestamp=forum_post_timestamp(row),
                        other_features=forum_post_features(row))
    except Exception:
        # Best effort: report the failure and carry on with ``fc=None``.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        fc = None
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        # Send the traceback to stderr too, so it stays with its header
        # line instead of landing on stdout.
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc
Beispiel #5
0
def from_forum_post(row):
    '''Returns ``(cid, FeatureCollection)`` given a forum post row.

    :param row: a mapping with at least ``content`` (the post's HTML)
      and ``thread_link`` (used as the URL). It is also passed to
      ``forum_post_id``, ``forum_post_timestamp`` and
      ``forum_post_features``.
    :returns: ``(cid, fc)`` where ``cid`` comes from ``forum_post_id``
      and ``fc`` is whatever ``html_to_fc`` returns, or ``None`` if
      building it raised an exception (the traceback is written to
      stderr).
    '''
    cid = forum_post_id(row)
    fc = None
    try:
        fc = html_to_fc(row['content'].strip(),
                        url=row['thread_link'],
                        timestamp=forum_post_timestamp(row),
                        other_features=forum_post_features(row))
    except Exception:
        # Best effort: ``fc`` stays ``None`` and the failure is reported.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        print('Could not create FC for %s:' % cid, file=sys.stderr)
        # Send the traceback to stderr too, so it stays with its header
        # line instead of landing on stdout.
        print(traceback.format_exc(), file=sys.stderr)
    return cid, fc