import itertools

## `cleanse`, `Label`, `add_annotation`, `logger`, and the `features`
## helpers used below are assumed to be provided by the surrounding
## pipeline modules.


def multi_token_match(stream_item, aligner_data):
    '''
    iterate through tokens looking for near-exact matches to strings
    in si.ratings...mentions
    '''
    sentences = stream_item.body.sentences.get(aligner_data['tagger_id'])
    if not sentences:
        return
    ## construct a list of tuples, where the first part of each tuple
    ## is a list of cleansed strings, and the second part is the
    ## Token object from which it came.
    tokens = map(
        lambda tok: (cleanse(tok.token.decode('utf8')).split(' '), tok),
        itertools.chain(*[sent.tokens for sent in sentences]))

    for annotator_id, ratings in stream_item.ratings.items():
        if annotator_id == aligner_data['annotator_id']:
            for rating in ratings:
                label = Label(annotator=rating.annotator,
                              target=rating.target)

                num_tokens_matched = 0
                for tok in look_ahead_match(rating, tokens):
                    if aligner_data.get('update_labels'):
                        tok.labels.pop(annotator_id, None)
                    add_annotation(tok, label)
                    num_tokens_matched += 1

                if num_tokens_matched == 0:
                    logger.critical(
                        'failed multi_token_match %r:\n mentions: %r\n'
                        ' tokens: %r\n clean_html=%r',
                        stream_item.abs_url, rating.mentions,
                        tokens, stream_item.body.clean_html)
                else:
                    logger.debug('matched %d tokens for %r',
                                 num_tokens_matched,
                                 rating.target.target_id)
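## Hedged example (not part of the original code) of the aligner_data
## mapping multi_token_match expects; the key names come straight from
## the lookups above, the values are hypothetical.
EXAMPLE_ALIGNER_DATA = {
    'tagger_id': 'my_tagger',        # whose sentences/tokens to align
    'annotator_id': 'my_annotator',  # whose ratings to match against
    'update_labels': True,           # drop stale labels before re-adding
}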
import os

import magic

import streamcorpus


def _make_stream_item(cls, path, metadata, abs_url, entities):
    '''
    build a StreamItem from the file at `path`, attaching one
    doc-level Rating per entity
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.
    creation_time = os.path.getctime(path)

    ## make stream item
    stream_item = streamcorpus.make_stream_item(creation_time, abs_url)
    stream_item.source = metadata.get('source')

    ## build a ContentItem for the body
    body = streamcorpus.ContentItem()
    body.media_type = magic.from_file(path, mime=True)

    logger.info('opening %r', path)
    with open(path, 'rb') as f:  # raw bytes, not decoded text
        body.raw = f.read()

    ## attach the content_item to the stream_item
    stream_item.body = body

    ## annotations
    anno = streamcorpus.Annotator()
    anno.annotator_id = metadata['annotator_id']
    anno.annotation_time = stream_item.stream_time

    num_ratings = 0
    for entity, is_profile in entities:
        num_ratings += 1

        ## pull out target id and mention tokens
        target_id = str(entity['target_id'])

        ## build a Rating for the doc-level label:
        rating = streamcorpus.Rating()
        rating.annotator = anno
        rating.target = streamcorpus.Target(target_id=target_id)
        rating.contains_mention = True
        if is_profile:
            rating.flags = [streamcorpus.FlagType.PROFILE]

        ## parse slots in yaml file
        slots = cls._parse_slots(entity['slots'])

        ## heuristically split the slots string on white space and
        ## use each token as a separate mention.
        rating.mentions = [cleanse(unicode(slot[1], 'utf-8'))
                           for slot in slots]

        ## put this one rating in the array of ratings
        streamcorpus.add_annotation(stream_item, rating)

    ## provide this stream_item to the pipeline
    logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                num_ratings, stream_item.abs_url)
    return stream_item
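## Hedged example (not part of the original code): the two metadata
## keys _make_stream_item actually reads.  `entities` must be an
## iterable of (entity, is_profile) pairs where each entity mapping
## carries 'target_id' and 'slots' keys in whatever form
## cls._parse_slots accepts.
EXAMPLE_METADATA = {
    'source': 'news',                # hypothetical value
    'annotator_id': 'my_annotator',  # hypothetical value
}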
def look_ahead_match(rating, tokens):
    '''
    iterate through all tokens looking for matches of cleansed
    tokens, skipping tokens left empty by cleansing and coping with
    Token objects that produce multiple space-separated strings when
    cleansed.
    '''
    ## this ensures that all cleansed mention tokens are non-zero length
    clean_mentions = []
    for m in rating.mentions:
        mtoks = cleanse(m.decode('utf8')).split(' ')
        if mtoks and mtoks != ['']:
            clean_mentions.append(mtoks)
        else:
            logger.warn('got empty cleansed mention: %r\nrating=%r',
                        m, rating)

    for i in range(len(tokens)):
        for mtoks in clean_mentions:
            if tokens[i][0][0] == mtoks[0]:
                ## found the start of a possible match, so iterate
                ## through the lists of cleansed strings for each
                ## Token while stepping through the cleansed strings
                ## for this mention.
                m_j = 1                 # position within the mention
                i_j = 0                 # position within a Token's strings
                last_token_matched = 0  # offset of the last Token consumed
                matched = True
                while m_j < len(mtoks):
                    i_j += 1
                    if i_j == len(tokens[i + last_token_matched][0]):
                        ## exhausted this Token; advance to the next one
                        i_j = 0
                        last_token_matched += 1
                        if i + last_token_matched == len(tokens):
                            matched = False
                            break
                    if mtoks[m_j] == tokens[i + last_token_matched][0][i_j]:
                        m_j += 1
                    elif tokens[i + last_token_matched][0][i_j] == '':
                        ## skip strings left empty by cleansing
                        continue
                    else:
                        matched = False
                        break
                if matched:
                    ## yield each matched Token only once
                    toks = set()
                    for j in xrange(last_token_matched + 1):
                        toks.add(tokens[i + j][1])
                    for tok in toks:
                        yield tok
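## Hedged usage sketch (not part of the original code): exercises
## look_ahead_match with hypothetical stand-ins for streamcorpus
## Token and Rating objects.  Assumes cleanse() passes lower-case
## ASCII through unchanged, consistent with test_cleanse below.
def _demo_look_ahead_match():
    class FakeToken(object):
        def __init__(self, token):
            self.token = token

    class FakeRating(object):
        mentions = ['john smith']  # UTF-8 byte strings, as in StreamItems

    raw = [FakeToken('John'), FakeToken('Smith'), FakeToken('Boston')]
    ## same (cleansed strings, Token) structure multi_token_match builds
    tokens = [(cleanse(t.token.decode('utf8')).split(' '), t) for t in raw]
    matched = list(look_ahead_match(FakeRating(), tokens))
    assert sorted(t.token for t in matched) == ['John', 'Smith']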
import collections


def make_chains_with_names(sentences):
    '''
    assemble in-doc coref chains by mapping equiv_id to tokens and
    their cleansed name strings

    :param sentences: mapping from tagger_id to lists of Sentence
      objects carrying Token sequences
    :returns dict: keys are equiv_ids, values are
      tuple(set of cleansed name strings, set of Token objects)
    '''
    ## if an equiv_id is -1, then the token is classified into some
    ## entity_type but has no other tokens in its chain.  We don't
    ## want these all lumped together, so we give each one a distinct
    ## "fake" equiv_id other than -1 -- counting negatively to avoid
    ## collisions with "real" equiv_ids
    fake_equiv_ids = -2

    ## use a default dictionary
    equiv_ids = collections.defaultdict(lambda: (set(), set()))

    for tagger_id, sents in sentences.items():
        for sent in sents:
            for tok in sent.tokens:
                if tok.entity_type is not None:
                    ## get an appropriate equiv_id
                    if tok.equiv_id == -1:
                        eqid = fake_equiv_ids
                        fake_equiv_ids -= 1
                    else:
                        eqid = tok.equiv_id
                    ## store the name parts initially as a set
                    equiv_ids[eqid][0].add(cleanse(tok.token.decode('utf8')))
                    ## carry a *reference* to the entire Token object
                    equiv_ids[eqid][1].add(tok)

    return equiv_ids
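## Hedged usage sketch (not part of the original code): shows the
## input/output shapes with hypothetical stand-ins for streamcorpus
## Sentence and Token objects.  Assumes cleanse() lower-cases ASCII,
## consistent with test_cleanse below.
def _demo_make_chains_with_names():
    class FakeToken(object):
        def __init__(self, token, entity_type, equiv_id):
            self.token = token
            self.entity_type = entity_type
            self.equiv_id = equiv_id

    class FakeSentence(object):
        def __init__(self, tokens):
            self.tokens = tokens

    sentences = {'my_tagger': [FakeSentence([
        FakeToken('John', entity_type=0, equiv_id=7),
        FakeToken('Smith', entity_type=0, equiv_id=7),
        FakeToken('Boston', entity_type=1, equiv_id=-1),
    ])]}
    chains = make_chains_with_names(sentences)
    assert chains[7][0] == set([u'john', u'smith'])
    assert chains[-2][0] == set([u'boston'])  # first "fake" equiv_id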
import zlib

import cbor


def unpack_noun_phrases(row):
    ## the stored response body is zlib-compressed CBOR
    body = cbor.loads(zlib.decompress(row['f:response.body']))
    body = make_clean_visible(body.encode('utf-8')).decode('utf-8')
    body = cleanse(body)
    return features.noun_phrases(body)
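## Hedged usage sketch (not part of the original code): packs a row in
## the HBase-style shape unpack_noun_phrases expects; cbor.dumps and
## zlib.compress invert the calls above.
def _demo_unpack_noun_phrases():
    row = {'f:response.body':
           zlib.compress(cbor.dumps(u'<p>Big dogs have big teeth.</p>'))}
    return unpack_noun_phrases(row)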
import time
from itertools import chain


def html_to_fc(html=None, clean_html=None, clean_visible=None,
               encoding=None, url=None, timestamp=None,
               other_features=None):
    '''`html` is expected to be a raw string received over the wire
    from a remote webserver, and `encoding`, if provided, is used to
    decode it.  Typically, encoding comes from the Content-Type
    header field.  The
    :func:`~streamcorpus_pipeline._clean_html.make_clean_html`
    function handles character encodings.
    '''
    def add_feature(name, xs):
        if name not in fc:
            fc[name] = StringCounter()
        fc[name] += StringCounter(xs)

    timestamp = timestamp or int(time.time() * 1000)
    other_features = other_features or {}

    if clean_html is None:
        if html is not None:
            try:
                clean_html_utf8 = make_clean_html(html, encoding=encoding)
            except Exception:
                logger.warn('dropping doc because:', exc_info=True)
                return
            clean_html = clean_html_utf8.decode('utf-8')
        else:
            clean_html_utf8 = u''
            clean_html = u''
    else:
        ## encode the caller-provided clean_html so clean_visible can
        ## be derived from it below
        clean_html_utf8 = clean_html.encode('utf-8')

    if clean_visible is None or len(clean_visible) == 0:
        clean_visible = make_clean_visible(clean_html_utf8).decode('utf-8')
    elif isinstance(clean_visible, str):
        clean_visible = clean_visible.decode('utf-8')

    fc = FeatureCollection()
    fc[u'meta_raw'] = html and uni(html, encoding) or u''
    fc[u'meta_clean_html'] = clean_html
    fc[u'meta_clean_visible'] = clean_visible
    fc[u'meta_timestamp'] = unicode(timestamp)

    url = url or u''
    fc[u'meta_url'] = uni(url)

    add_feature(u'icq', features.ICQs(clean_visible))
    add_feature(u'skype', features.skypes(clean_visible))
    add_feature(u'phone', features.phones(clean_visible))
    add_feature(u'email', features.emails(clean_visible))
    bowNP, normalizations = features.noun_phrases(
        cleanse(clean_visible), included_unnormalized=True)
    add_feature(u'bowNP', bowNP)
    bowNP_unnorm = chain(*normalizations.values())
    add_feature(u'bowNP_unnorm', bowNP_unnorm)

    add_feature(u'image_url', features.image_urls(clean_html))
    add_feature(u'a_url', features.a_urls(clean_html))

    ## get parsed versions, extract usernames
    fc[u'img_url_path_dirs'] = features.path_dirs(fc[u'image_url'])
    fc[u'img_url_hostnames'] = features.host_names(fc[u'image_url'])
    fc[u'usernames'] = features.usernames(fc[u'image_url'])

    fc[u'a_url_path_dirs'] = features.path_dirs(fc[u'a_url'])
    fc[u'a_url_hostnames'] = features.host_names(fc[u'a_url'])
    fc[u'usernames'] += features.usernames(fc[u'a_url'])

    #fc[u'usernames'] += features.usernames2(
    #    fc[u'meta_clean_visible'])

    # beginning of treating this as a pipeline...
    xform = features.entity_names()
    fc = xform.process(fc)

    for feat_name, feat_val in other_features.iteritems():
        fc[feat_name] += StringCounter(feat_val)

    return fc
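## Hedged usage sketch (not part of the original code): the minimal
## call, letting html_to_fc derive clean_html and clean_visible on its
## own.  The URL and timestamp are hypothetical.
def _demo_html_to_fc():
    fc = html_to_fc(
        html='<html><body><p>email me: someone@example.com</p></body></html>',
        encoding='utf-8',
        url=u'http://example.com/profile',
        timestamp=1400000000000)
    if fc is not None:  # html_to_fc returns None on unparseable docs
        print fc[u'meta_clean_visible']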
def test_cleanse():
    assert cleanse(u'This -LRB-big-RRB- dog has no \u1F601 Teeth') \
        == u'this big dog has no \u1F601 teeth'