Code example #1 (score: 0)
File: nlp.py — Project: Priya22/tweedr
    def __call__(self, tweet):
        """Tag a tweet's text with the CRF and record labeled spans.

        Tokenizes ``tweet['text']`` with ``token_re``, featurizes the
        tokens, predicts one label per token with ``self.crf``, and
        appends each contiguous run of non-``'None'`` labels to
        ``tweet['sequences']`` as a dict holding the label (under the
        key ``'text'``) and the run's start/end boundaries.

        Returns the same (mutated) tweet dict.
        """
        text = tweet['text']
        tokens = token_re.findall(text)

        tokens_features = featurize(tokens, self.feature_functions)

        null_label = 'None'
        labels = self.crf.predict([tokens_features])[0]

        if 'sequences' not in tweet:
            tweet['sequences'] = []

        # Group consecutive (label, start, end) triples by label.
        for sequence_label, entries in itertools.groupby(zip_boundaries(labels), lambda tup: tup[0]):
            if sequence_label != null_label:
                # Use a distinct name here: the original code rebound the
                # outer `labels` variable mid-loop, which was confusing
                # (though harmless, since groupby already holds a
                # reference to the original iterable).
                run_labels, starts, ends = zip(*entries)

                tweet['sequences'].append({
                    # NOTE(review): 'text' holds the predicted *label*,
                    # not the surface text of the span — confirm that
                    # downstream consumers expect this.
                    'text': sequence_label,
                    'start': starts[0],
                    'end': ends[-1],
                })

        return tweet
Code example #2 (score: 0)
File: dbpedia.py — Project: Priya22/tweedr
def spotlight(document, confidence=0.1, support=10):
    """Annotate *document* (a sequence of tokens) with DBpedia Spotlight.

    Joins the tokens with single spaces, POSTs the text to the Spotlight
    annotate endpoint, then yields one list of labels per token: for
    every returned resource whose character span overlaps the token's
    span, the entity URI followed by its DBpedia types (an empty list
    when no resource overlaps the token).

    Note: assumes the token boundaries produced by ``zip_boundaries``
    are character offsets into the space-joined string — TODO confirm.
    """
    document_string = u' '.join(document)
    r = requests.post(spotlight_annotate_url,
        headers=dict(Accept='application/json'),
        data=dict(text=document_string, confidence=confidence, support=support))
    Resources = r.json().get('Resources', [])

    # Precompute each resource's character span and labels once,
    # instead of re-deriving them for every token.
    entity_spans = []
    for Resource in Resources:
        entity_start = int(Resource['@offset'])
        entity_end = entity_start + len(Resource['@surfaceForm'])
        entity_labels = [Resource['@URI']] + Resource['@types'].split(',')
        entity_spans.append((entity_start, entity_end, entity_labels))

    for token, token_start, token_end in zip_boundaries(document):
        labels = []
        for entity_start, entity_end, entity_labels in entity_spans:
            # Standard interval-overlap test. The original condition
            # (token start OR token end inside the entity span) missed
            # the case where the token strictly contains the entity.
            if token_start <= entity_end and token_end >= entity_start:
                labels += entity_labels
        yield labels
Code example #3 (score: 0)
File: dbpedia.py — Project: wulfboy-95/tweedr
def spotlight(document, confidence=0.1, support=10):
    """Yield DBpedia Spotlight labels for each token of *document*.

    The tokens are joined with spaces and sent to the Spotlight
    annotate endpoint via ``requests.post``. For each token this
    generator yields a list containing, for every returned resource
    whose character span overlaps the token's span, the entity URI
    plus the entity's DBpedia types; tokens with no overlapping
    resource yield an empty list.

    Note: assumes ``zip_boundaries`` yields character offsets into the
    space-joined string — TODO confirm against its definition.
    """
    document_string = u' '.join(document)
    r = requests.post(spotlight_annotate_url,
                      headers=dict(Accept='application/json'),
                      data=dict(text=document_string,
                                confidence=confidence,
                                support=support))
    Resources = r.json().get('Resources', [])

    # Hoist the per-resource span/label computation out of the token
    # loop — it is invariant across tokens.
    entity_spans = [
        (int(Resource['@offset']),
         int(Resource['@offset']) + len(Resource['@surfaceForm']),
         [Resource['@URI']] + Resource['@types'].split(','))
        for Resource in Resources
    ]

    for token, token_start, token_end in zip_boundaries(document):
        labels = []
        for entity_start, entity_end, entity_labels in entity_spans:
            # Interval-overlap test; the original endpoint-membership
            # check missed tokens that strictly contain the entity span.
            if token_start <= entity_end and token_end >= entity_start:
                labels += entity_labels
        yield labels