def test_target_parsing():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    test_html = open(os.path.join(path, 'target-test.html')).read()

    html = make_clean_html(test_html)

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)

    assert 'logo' not in visible
    assert 'target' not in visible

    hyperlink_labels = _init_stage(
        'hyperlink_labels',
        dict(offset_types=['LINES'],
             require_abs_url=True,
             all_domains=True))
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    hyperlink_labels(si, context)
    html2 = si.body.clean_html

    visible2 = make_clean_visible(html2)

    #print visible2

    assert 'target' not in visible2
    assert 'logo' not in visible2
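The fixture file itself isn't shown on this page; a hypothetical stand-in that would satisfy these assertions could look like the snippet below, assuming make_clean_html preserves id and target attributes while make_clean_visible strips all markup:

# Hypothetical stand-in for 'target-test.html': both marker strings live
# only inside attributes, never in visible text, so they survive
# make_clean_html but disappear from make_clean_visible output.
TARGET_TEST_HTML = (
    '<html><body>'
    '<div id="logo">'
    '<a href="http://example.com/" target="_blank">click here</a>'
    '</div>'
    '</body></html>'
)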
Example #2
def test_target_parsing(test_data_dir):
    path = os.path.join(test_data_dir, 'test')
    test_html = open(os.path.join(path, 'target-test.html')).read()

    html = make_clean_html(test_html)

    assert 'logo' in html
    assert 'target' in html

    visible = make_clean_visible(html)

    assert 'logo' not in visible
    assert 'target' not in visible

    stage = hyperlink_labels(config={
        'offset_types': ['LINES'],
        'require_abs_url': True,
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage(si, context)
    html2 = si.body.clean_html

    visible2 = make_clean_visible(html2)

    #print visible2

    assert 'target' not in visible2
    assert 'logo' not in visible2
Example #3
def test_handles(eval_data):
    for text, expected in eval_data:
        text = make_clean_html(text)
        text = make_clean_visible(text)
        sc = extract_user_names(text)

        assert set(sc) == set(expected)
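test_handles consumes (text, expected) pairs, so an eval_data fixture is presumably shaped like this (values are made up for illustration):

# Hypothetical eval_data fixture: raw HTML paired with the user names
# extract_user_names is expected to recover from it.
eval_data = [
    ('<p>ping me on twitter: @alice_w</p>', ['alice_w']),
    ('<p>no handles in this one</p>', []),
]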
Example #4
    def fill_more_slots(self,
                        slots,
                        text,
                        save=False,
                        phone=True,
                        skype=True,
                        twitter=True,
                        si=None):
        clean_html = make_clean_html(text, stream_item=si)
        text = make_clean_visible(clean_html)

        if save:
            open('foobar-%s.txt' % md5(text).hexdigest(), 'wb').write(text)

        for key in [
                'phone', 'phone_raw', 'Skype', 'Twitter', 'email', 'keywords'
        ]:
            if key not in slots:
                slots[key] = set()

        email_matches = list(email_matcher(text))
        if email_matches:
            email_match = email_matches[0][CANONICAL]
            slots['email'].add(email_match)

        if phone:
            phone_matches = list(phonenumber_matcher(text, country='US'))
            for phone_match in phone_matches:
                slots['phone'].add(phone_match[CANONICAL])
                slots['phone_raw'].add(phone_match[RAW])

        if skype:
            skype_matches = list(skype_matcher(text))
            for skype_match in skype_matches:
                slots['Skype'].add(skype_match[CANONICAL])

        if twitter:
            twitter_matches = list(twitter_matcher(text))
            for twitter_match in twitter_matches:
                slots['Twitter'].add(twitter_match[CANONICAL])

        for tok in text.split():  # assume non-CJK
            if prob_username(tok.lower(), self.char_unigrams,
                             self.char_bigrams) > 0.5:
                slots['keywords'].add(tok)

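        # drop slot keys that ended up empty; in Python 2, items() returns
        # a list, so popping keys while iterating over it is safe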
        for key, val in slots.items():
            if not val:
                slots.pop(key)

        return slots
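A hypothetical call; `extractor` stands in for an instance of the enclosing class (not shown on this page), with its char_unigrams and char_bigrams models already loaded:

# Hypothetical usage of fill_more_slots with made-up input.
slots = extractor.fill_more_slots({}, u'<p>Mail me at bob@example.com</p>')
# slots.get('email') would then contain set([u'bob@example.com'])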
Example #5
def test_unicode_conversion(test_data_dir):
    path = os.path.join(test_data_dir, 'test')
    test_html = open(os.path.join(path, 'raw-unicode-issues.html')).read()

    print type(test_html)
    print test_html
    print repr(test_html)
    print str(test_html).decode('utf8')

    html = make_clean_html(test_html)

    print unicode(html)

    visible = make_clean_visible(test_html and html)

    print type(visible)

    print visible.decode('utf8')
Example #6
def test_unicode_conversion():
    path = os.path.dirname(__file__)
    path = os.path.join(path, _TEST_DATA_ROOT, 'test')
    test_html = open(os.path.join(path, 'raw-unicode-issues.html')).read()

    print type(test_html)
    print test_html
    print repr(test_html)
    print str(test_html).decode('utf8')

    html = make_clean_html(test_html)

    print unicode(html)

    visible = make_clean_visible(html)

    print type(visible)

    print visible.decode('utf8')
Example #7
def unpack_noun_phrases(row):
    body = cbor.loads(zlib.decompress(row['f:response.body']))
    body = make_clean_visible(body.encode('utf-8')).decode('utf-8')
    body = cleanse(body)
    return features.noun_phrases(body)
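The row presumably comes from a table scan in which the 'f:response.body' cell holds a zlib-compressed, CBOR-encoded unicode string; a round-trip sketch with made-up data:

# Hypothetical row in the format unpack_noun_phrases expects.
import zlib
import cbor

row = {'f:response.body': zlib.compress(cbor.dumps(u'The quick brown fox jumps.'))}
phrases = unpack_noun_phrases(row)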
Example #9
def html_to_fc(html=None,
               clean_html=None,
               clean_visible=None,
               encoding=None,
               url=None,
               timestamp=None,
               other_features=None):
    '''`html` is expected to be a raw string received over the wire from a
    remote webserver, and `encoding`, if provided, is used to decode
    it.  Typically, encoding comes from the Content-Type header field.
    The :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.

    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except Exception:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        clean_html_utf8 = u''

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''

    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(cleanse(clean_visible),
                                                  included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])

    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
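A minimal hypothetical call; note that html_to_fc returns None when make_clean_html raises, so callers should check the result:

# Hypothetical usage with made-up markup.
fc = html_to_fc(html='<html><body><p>Skype: alice.w</p></body></html>',
                encoding='utf-8',
                url=u'http://example.com/profile')
if fc is not None:  # None means the document was dropped during cleaning
    print fc[u'meta_url']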
Example #11
    # print 'is a (simply) a username? %r' % simple.classify(username)
    # print 'is a (nb) a username? %r' % nb.classify(username)

    parser = argparse.ArgumentParser()
    parser.add_argument('positive', help='File containing the positive test examples.')
    parser.add_argument('--negative', default=None, help='File with negative examples. If omitted, will generate randomly.')
    parser.add_argument('--test-text', 
                        help=('name of entity for whom data has been saved: %r' % 
                              usernames_with_saved_data))
    args = yakonfig.parse_args(parser, [yakonfig, dblogger])

    if args.test_text:
        for eg, tr in load_eval_data(args.test_text):
            eg = make_clean_html(eg)
            eg = make_clean_visible(eg)
            sc = extract_user_names(eg)
            found = set(sc)
            expected = set(tr)
            TP = found.intersection(expected)
            FN = expected - found
            FP = found - expected
            print('TP: \n\t%s' % '\n\t'.join(TP))
            print('\n\nFN: \n\t%s' % '\n\t'.join(FN))
            print('\n\nFP: \n\t%s' % '\n\t'.join(FP))
            P = float(len(TP)) / (len(TP) + len(FP))
            R = float(len(TP)) / (len(TP) + len(FN))
            F = 2 * P * R / (P + R)
            print('F=%.4f, P=%.4f, R=%.4f, TP=%d, FN=%d, FP=%d' % (F, P, R, len(TP), len(FN), len(FP)))
            #print sc
        sys.exit()
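For a quick sanity check of the precision/recall/F1 arithmetic above, with made-up counts:

# Made-up counts: 8 true positives, 2 false positives, 4 false negatives.
TP, FP, FN = 8.0, 2.0, 4.0
P = TP / (TP + FP)       # 0.8
R = TP / (TP + FN)       # 0.666...
F = 2 * P * R / (P + R)  # ~0.727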