コード例 #1
0
ファイル: babelfy.py プロジェクト: waterzxj/learningbyreading
def babelfy(text, key):
    """Annotate *text* with BabelFy and collect synset/entity records.

    Args:
        text: plain text to disambiguate.
        key: BabelFy API key.

    Returns:
        dict with two parallel lists, 'synsets' and 'entities'; each item
        carries the token span, the BabelNet synset id and the matched lemma.
    """
    client = BabelfyClient(key, {"lang": "en"})
    client.babelfy(text)

    synsets, entities = [], []
    for item in client.merged_entities:
        # Shared span/lemma fields; the two result lists differ only in
        # whether the synset id is stored under 'synset' or 'entity'.
        base = {
            'token_start': item["tokenFragment"]["start"],
            'token_end': item["tokenFragment"]["end"],
        }
        sid = item["babelSynsetID"]
        lemma = item["text"]
        synsets.append({**base, 'synset': sid, 'lemma': lemma})
        entities.append({**base, 'entity': sid, 'lemma': lemma})
    return {'synsets': synsets, 'entities': entities}
コード例 #2
0
 def __init__(self,
              field_to_link: str,
              api_key: str = None,
              lang: str = "EN"):
     """Set up the linker over *field_to_link* and create its BabelFy client.

     Args:
         field_to_link: name of the raw-content field to disambiguate.
         api_key: BabelFy API key; with None only a few queries can run.
         lang: language code passed to BabelFy (default "EN").
     """
     super().__init__("all_retrieved")
     self.__field_to_link = field_to_link
     self.__api_key = api_key
     params = {"lang": lang}
     self.__babel_client = BabelfyClient(self.__api_key, params)
コード例 #3
0
def findentities(lines, lang, args, cache=None):
    """Find entities using BabelFy given a set of input lines.

    Args:
        lines: sequence of input text lines.
        lang: language code (upper-cased before sending to BabelFy).
        args: parsed command-line namespace (apikey, cands, anntype,
            annres, th, match, mcs, dens, extaida, postag, dryrun, overlap).
        cache: optional mutable mapping memoising BabelFy responses per
            text chunk; None disables caching.

    Yields:
        entity dicts augmented with 'linenr' and 'offset', skipping any
        entity whose 'ignore' key is truthy.

    Raises:
        ValueError: propagated from resolveoffset after logging the
            chunk and entity that triggered it.
    """
    babelfy_params = {'lang': lang.upper()}
    # Optional BabelFy parameters: forwarded only when the user supplied
    # them; (args attribute, BabelFy parameter name) pairs.
    optional = (
        ('cands', 'cands'),
        ('anntype', 'annType'),
        ('annres', 'annRes'),
        ('th', 'th'),
        ('match', 'match'),
        ('mcs', 'MCS'),
    )
    for attr, param in optional:
        value = getattr(args, attr)
        if value is not None:
            babelfy_params[param] = value
    # Boolean flags are sent as the literal string "true" when enabled.
    if args.dens:
        babelfy_params['dens'] = "true"
    if args.extaida:
        babelfy_params['extAida'] = "true"
    if args.postag is not None:
        babelfy_params['posTag'] = args.postag
    babelclient = BabelfyClient(args.apikey, babelfy_params)
    for i, (text, firstlinenr, lastlinenr,
            offsetmap) in enumerate(gettextchunks(lines, maxchunksize=4096)):
        if args.dryrun:
            print(f"---\nCHUNK #{i}. Would run query for firstlinenr="
                  f"{firstlinenr}, lastlinenr={lastlinenr}",
                  " text=" + text,
                  file=sys.stderr)
            print("Offsetmap:", repr(offsetmap), file=sys.stderr)
        elif cache is not None and text in cache:
            entities = cache[text]
            print(f"chunk #{i} -- retrieved from cache", file=sys.stderr)
        else:
            print(f"chunk #{i} -- querying BabelFy", file=sys.stderr)
            babelclient.babelfy(text)
            entities = babelclient.entities
            if cache is not None:
                cache[text] = entities  # memoise for identical chunks
        if not args.dryrun:
            for j, entity in enumerate(resolveoverlap(entities, args.overlap)):
                try:
                    entity['linenr'], entity['offset'] = resolveoffset(
                        offsetmap, entity['start'], lines, entity)
                    if 'ignore' not in entity or not entity['ignore']:
                        yield entity
                except ValueError:
                    # Log full context before re-raising so the failing
                    # chunk/entity can be reproduced.
                    print(f"---\nCHUNK #{i} ENTITY #{j}. Ran query for "
                          f"firstlinenr={firstlinenr}, lastlinenr={lastlinenr}",
                          " text=" + text,
                          file=sys.stderr)
                    print("Entity:", repr(entity), file=sys.stderr)
                    print("Offsetmap:", repr(offsetmap), file=sys.stderr)
                    raise  # bare raise keeps the original traceback intact
コード例 #4
0
class BabelPyEntityLinking(EntityLinking):
    """
    Interface for the Babelpy library that wraps some feature of Babelfy entity Linking.

    Args:
        api_key: string obtained by registering to
            babelfy website, with None babelpy key only few
            queries can be executed
    """
    def __init__(self, api_key: str = None):
        super().__init__()
        self.__api_key = api_key
        # The client is created lazily by the ``lang`` setter, which
        # supplies the language parameter BabelFy requires.
        self.__babel_client = None

    @FieldContentProductionTechnique.lang.setter
    def lang(self, lang: str):
        """Set the technique language and (re)build the BabelFy client for it."""
        FieldContentProductionTechnique.lang.fset(self, lang)
        self.__babel_client = BabelfyClient(self.__api_key, {'lang': self.lang})

    def __str__(self):
        return "BabelPyEntityLinking"

    def produce_content(self, field_representation_name: str,
                        field_data) -> FeaturesBagField:
        """
        Produces the field content for this representation: a bag of
        features whose keys are BabelNet synset ids and whose values are
        the global scores of the synsets.

        Args:
            field_representation_name (str): Name of the field representation
            field_data: Text that will be linked to BabelNet

        Returns:
            feature_bag (FeaturesBagField)
        """
        field_data = check_not_tokenized(field_data)

        self.__babel_client.babelfy(field_data)
        feature_bag = FeaturesBagField(field_representation_name)
        # A single handler replaces the original duplicated nested
        # try/except blocks: AttributeError can be raised either by the
        # ``entities`` property or while iterating it (e.g. when the
        # BabelFy query failed); both were ignored before and still are.
        try:
            if self.__babel_client.entities is not None:
                for entity in self.__babel_client.entities:
                    feature_bag.append_feature(entity['babelSynsetID'],
                                               entity['globalScore'])
        except AttributeError:
            pass

        return feature_bag
コード例 #5
0
class BabelPyEntityLinking(EntityLinking):
    """
    Interface for the Babelpy library that wraps some feature of Babelfy entity Linking.

    Args:
        api_key: string obtained by registering to babelfy website, with None babelpy key only few
            queries can be executed
    """

    def __init__(self, api_key: str = None):
        super().__init__()
        self.__api_key = api_key
        # ``self.lang`` is provided by the EntityLinking hierarchy.
        self.__babel_client = BabelfyClient(self.__api_key, {"lang": self.lang})

    def produce_single_repr(self, field_data: Union[List[str], str]) -> FeaturesBagField:
        """
        Produces a bag of features whose keys are BabelNet synset ids and
        whose values are the global scores of the synsets.

        Args:
            field_data: text (or token list) that will be linked to BabelNet.

        Returns:
            FeaturesBagField wrapping the {synset_id: global_score} mapping.
        """
        field_data = check_not_tokenized(field_data)

        self.__babel_client.babelfy(field_data)
        feature_bag = {}
        # One handler replaces the original duplicated nested try/except:
        # AttributeError may come either from the ``entities`` property or
        # while iterating it (e.g. when the query failed); both cases were
        # silently ignored before and still are.
        try:
            if self.__babel_client.entities is not None:
                for entity in self.__babel_client.entities:
                    feature_bag[entity['babelSynsetID']] = entity['globalScore']
        except AttributeError:
            pass

        return FeaturesBagField(feature_bag)

    def __str__(self):
        return "BabelPyEntityLinking"

    def __repr__(self):
        return "< BabelPyEntityLinking: babel client = " + str(self.__babel_client) + " >"
コード例 #6
0
ファイル: gsearch.py プロジェクト: VforV93/lookup-lucapp
def simplify_ques_fy(question):
    """Simplify an (Italian) question using BabelFy annotations.

    Extracts salient tokens, candidate proper nouns and word senses from
    the question, and detects whether the question is phrased negatively.

    Args:
        question: the raw question string.

    Returns:
        tuple (ParsedQuestion, bool): the parsed question (original text,
        senses/proper nouns, simplified token list) and the negation flag.
    """

    params = {'lang': 'IT', 'th': '.0', 'match': 'PARTIAL_MATCHING'}

    # NOTE(review): this client is created and queried, but its result is
    # never read -- `data` below comes from babelfyAPI(params) instead.
    # Confirm whether this call is still needed.
    babel_client = BabelfyClient(os.environ['BABEL'], params)
    # Babelfy sentence.
    babel_client.babelfy(question)

    data = babelfyAPI(params)
    # Strip terminal punctuation (no_end_char is a punctuation regex).
    question = re.sub(no_end_char, '', question)
    splitted_question = question.split()
    simplfy_ques = []
    # Keep each annotated token once, skipping stop words.
    for result in data:
        for token in splitted_question[result['tokenFragment']['start']:
                                       result['tokenFragment']['end'] + 1]:
            if token not in simplfy_ques and token not in remove_words:
                simplfy_ques.append(token)

    # NOTE(review): the commented lines suggest this second query was
    # meant to run on the simplified question; as written, params is
    # unchanged ('match' is re-set to the value it already has), so the
    # same query is repeated.
    #squestion = " ".join(simplfy_ques)
    #params['text'] = squestion
    params['match'] = 'PARTIAL_MATCHING'
    data = babelfyAPI(params)

    qwords = question.lower().split()
    # check if the question is a negative one
    neg = False
    for w in qwords:
        if w in negative_words:
            neg = True
            break

    # Drop negation words so they don't pollute the extracted senses.
    for w in negative_words:
        if w in splitted_question:
            splitted_question.remove(w)

    rank_dict = {}
    simply_rank_dict = {}
    simply_rank_list = []
    check_synset = []
    senses = []

    for i, _ in enumerate(splitted_question):
        # if two subsequent words has the first letter uppercased they probably refers to a proper noun
        try:
            if splitted_question[i][0].isupper() and splitted_question[
                    i + 1][0].isupper():
                senses.append(splitted_question[i] + " " +
                              splitted_question[i + 1])
        except IndexError:
            pass

    # Build a combined score per annotation and index annotations by
    # every token position they cover.
    for result in data:
        result['score'] += result['coherenceScore']
        result['score'] += result['globalScore']
        for i in range(result['tokenFragment']['start'],
                       result['tokenFragment']['end'] + 1):
            rank_dict.setdefault(i, []).append(result)

    # For each token position keep only the best-scoring annotation, and
    # record its surface form as a candidate sense.
    for i in rank_dict:
        best = None
        for j, result in enumerate(rank_dict[i]):
            if best is None:
                best = result
            else:
                if result['score'] > best['score']:
                    best = result

        if best['babelSynsetID'] not in simply_rank_dict:
            simply_rank_dict[best['babelSynsetID']] = best
            check_synset.append(best['babelSynsetID'])
            els = " ".join(splitted_question[
                best['tokenFragment']['start']:best['tokenFragment']['end'] +
                1])
            if els not in senses and els not in remove_words:
                senses.append(els)

    # For nominal synsets (ids ending in 'n'), fetch the Italian lemma
    # from the BabelNet API and add it as a further sense.
    for bid in check_synset:
        if bid[-1] == 'n':
            params = {
                'id': bid,
                'targetLang': 'IT',
                'key': os.environ['BABEL']
            }
            ris = babelAPI(params)
            if ris not in senses and ris not in remove_words:
                senses.append(ris)

    return ParsedQuestion(question, senses, simplfy_ques), neg
コード例 #7
0
ファイル: gsearch.py プロジェクト: VforV93/lookup-lucapp
# Words that mark a question as negative, loaded from the settings file.
negative_words = json.loads(
    io.open("Data/settings.json", encoding="utf-8").read())["negative_words"]

# Punctuation characters stripped from the end of questions.
# (The previous comment said "negative words" -- a copy-paste slip.)
no_end_char = r'[!@#$?:,;]'

# Regex matching Wikipedia article URLs (any language subdomain).
# NOTE(review): the dot before 'wikipedia' is unescaped, so it matches any
# character -- confirm whether that is intended.
wiki_url = r'(https:\/\/|http:\/\/)([a-z]*.wikipedia.org)'

# Sentinel value meaning "no score available".
NO_SCORE = 969696

# Module-level BabelFy client (Italian, partial matching, no threshold)
# shared by the helpers below; the API key comes from the environment.
params = {'lang': 'IT', 'th': '.0', 'match': 'PARTIAL_MATCHING'}
babel_client = BabelfyClient(os.environ['BABEL'], params)


class ParsedQuestion:
    """Container for the pieces extracted from a user question."""

    def __init__(self, originalq, proper_nouns, simplyfiedq):
        """Store the original question, its proper nouns and its simplified form."""
        self.original = originalq
        self.proper_nouns = proper_nouns
        self.simplyfied = simplyfiedq

    def __str__(self):
        # Same output as the previous str.format version, as an f-string.
        return f"simplyfied:{self.simplyfied}\nproper_nouns:{self.proper_nouns}"


class Searcher(object):
コード例 #8
0
 def __init__(self, api_key: str = None):
     """Initialise the linker and build a BabelFy client for ``self.lang``.

     Args:
         api_key: BabelFy API key; with None only a few queries can run.
     """
     super().__init__()
     self.__api_key = api_key
     params = {"lang": self.lang}
     self.__babel_client = BabelfyClient(self.__api_key, params)
コード例 #9
0
ファイル: test_babelfy.py プロジェクト: proycon/babelpy
from __future__ import print_function, unicode_literals, division, absolute_import

import sys
import unittest
import json
import os
from babelpy.babelfy import BabelfyClient
from babelpy.config.config import API_KEY, LANG
from babelpy.reader import read_txt_file

# BabelFy query parameters; LANG comes from the package configuration.
params = dict()
params['lang'] = LANG
# Shared client used by the test cases below; API_KEY also from config.
bc = BabelfyClient(API_KEY, params)
# Sample sentence the expected annotations below were produced from.
txt = "BabelNet is both a multilingual encyclopedic dictionary and a semantic network."
entities = [{
    "tokenFragment": {
        "start": 0,
        "end": 0
    },
    "charFragment": {
        "start": 0,
        "end": 7
    },
    "babelSynsetID": "bn:03083790n",
    "DBpediaURL": "http://dbpedia.org/resource/BabelNet",
    "BabelNetURL": "http://babelnet.org/rdf/s03083790n",
    "score": 1.0,
    "coherenceScore": 0.6666666666666666,
    "globalScore": 0.11428571428571428,
    "source": "BABELFY"
}, {
コード例 #10
0
 def lang(self, lang: str):
     """Update the language and rebuild the BabelFy client to match it."""
     # Delegate to the parent property's setter first so ``self.lang``
     # already reflects the new value when the client is created.
     FieldContentProductionTechnique.lang.fset(self, lang)
     self.__babel_client = BabelfyClient(self.__api_key, {'lang': self.lang})
コード例 #11
0
class BabelPyEntityLinking(EntityLinking):
    """
    Interface for the Babelpy library that wraps some feature of Babelfy entity Linking.

    Args:
        field_to_link: name of the raw-content field whose text is linked.
        api_key: string obtained by registering to babelfy website, with None babelpy key only few
            queries can be executed
        lang: language code handed to BabelFy (default "EN").
    """

    # Properties copied from each BabelFy entity; missing ones default ''.
    # NOTE(review): BabelFy responses appear to use the key 'DBpediaURL'
    # (lower-case 'p'), so 'DBPediaURL' may never be populated -- confirm
    # before changing, since these keys are part of the output schema.
    _ENTITY_PROPERTIES = ('babelSynsetID', 'DBPediaURL', 'BabelNetURL',
                          'score', 'coherenceScore', 'globalScore', 'source')

    def __init__(self,
                 field_to_link: str,
                 api_key: str = None,
                 lang: str = "EN"):
        super().__init__("all_retrieved")
        self.__field_to_link = field_to_link
        self.__api_key = api_key
        self.__babel_client = BabelfyClient(self.__api_key, {"lang": lang})

    def get_properties(self,
                       raw_source: RawInformationSource) -> List[EntitiesProp]:
        """
        Produces a list of EntitiesProp objects for every raw content in the raw source.

        An EntitiesProp object is basically a dict where the keys are the entities linked (since there can be
        multiple entities in a field) and values are properties retrieved from BabelPy for that entity.
        EXAMPLE:
            properties_list = [EntityProp(), EntityProp(), ...]

            EntityProp.value -> {'DiCaprio': {'babelSynsetID': ..., ...},'Nolan': {'babelSynsetID: ..., ...}, ...}

        Raises:
            AttributeError: when the BabelFy service rejects the query,
                typically because the request limit was reached.
        """
        properties_list = []
        logger.info("Doing Entity Linking with BabelFy")
        # NOTE(review): len(list(raw_source)) materialises the whole source
        # just to size the progress bar; assumes raw_source is re-iterable.
        for raw_content in progbar(raw_source,
                                   max_value=len(list(raw_source))):
            data_to_disambiguate = check_not_tokenized(
                raw_content[self.__field_to_link])

            self.__babel_client.babelfy(data_to_disambiguate)

            properties_content = {}
            try:
                if self.__babel_client.merged_entities is not None:
                    for entity in self.__babel_client.merged_entities:
                        # Copy the known properties, defaulting missing
                        # (or None) values to an empty string.
                        properties_content[entity['text']] = {
                            key: entity[key] if entity.get(key) is not None
                            else ''
                            for key in self._ENTITY_PROPERTIES
                        }

                properties_list.append(EntitiesProp(properties_content))
            except AttributeError:
                raise AttributeError(
                    "BabelFy limit reached! Insert an api key or change it if you inserted one!"
                )

        return properties_list

    def __str__(self):
        return "BabelPyEntityLinking"

    def __repr__(self):
        return "< BabelPyEntityLinking: babel client = " + str(
            self.__babel_client) + " >"
コード例 #12
0
#from modules.pybabelfy.pybabelfy.babelfy import *
import os
from babelpy.babelfy import BabelfyClient

# Instantiate BabelFy client for Italian with partial matching and a
# zero score threshold ('th' of '.0' keeps every candidate annotation).
params = dict()
params['lang'] = 'IT'
params['th'] = '.0'
params['match'] = 'PARTIAL_MATCHING'
babel_client = BabelfyClient(os.environ['BABEL'], params)

# Babelfy sentence.
babel_client.babelfy("Chi tra i seguenti non e un nano di 'Biancaneve'")

# Get entity data.
print(babel_client.entities)
print('\n')
# Get entity and non-entity data.
print(babel_client.all_entities)
print('\n')
# Get merged entities only.
print(babel_client.merged_entities)
print('\n')
# Get all merged entities.
# NOTE(review): the expression below is evaluated but its result is
# discarded (no print) -- presumably a leftover; confirm the intent.
babel_client.all_merged_entities
"""
def frag(semantic_annotation, input_text):
    start = semantic_annotation.char_fragment_start()
    end = semantic_annotation.char_fragment_end()
    return input_text[start:end+1]
コード例 #13
0
ファイル: babelpy.py プロジェクト: proycon/babelpy
def main():
    """CLI entry point: annotate input text with BabelFy.

    Reads text from the command line or a file, babelfies it sentence by
    sentence, then optionally exports the collected annotations to a JSON
    file and/or pretty-prints them to stdout, depending on the parsed
    arguments.
    """
    global API_KEY

    # Parse the command-line arguments.
    args = parse()

    if not API_KEY:
        API_KEY = args.get('api_key')
        # Ensure the API key is unicode on both Python 2 and Python 3.
        if (sys.version < '3' and isinstance(API_KEY, str)) or (
                sys.version > '3' and isinstance(API_KEY, bytes)):
            API_KEY = API_KEY.decode('utf-8')
        elif not API_KEY:
            print('BabelFy API key is required.', file=sys.stderr)
            sys.exit()

    # Get the input text from cmd-line or file.
    if args.get('text'):
        text = [args.get('text')]
    elif args.get('text_file'):
        filepath = args.get('text_file')
        try:
            text = read_txt_file(filepath)
        except Exception:
            print('failed to read text', file=sys.stderr)
            sys.exit()
    else:
        print('need text data to babelfy. see --help option for usage.',
              file=sys.stderr)
        sys.exit()

    # Split the text into sentences, normalising newlines and encoding.
    text_list = list()
    for txt in text:
        sentence = txt.replace('\n', '').strip()
        if (sys.version < '3' and isinstance(sentence, str)) or (
                sys.version > '3' and isinstance(sentence, bytes)):
            sentence = sentence.decode('utf-8')
        text_list.append(sentence)

    try:
        # NOTE(review): this compares the *whole* last element of `text`
        # with '.', not its last character -- possibly meant to drop a
        # trailing empty sentence; confirm before changing.
        if text[-1] == '.':
            text_list = text_list[:-1]
    except IndexError:
        # Empty input; narrowed from a bare `except:` so unrelated errors
        # are no longer silently swallowed.
        pass

    # Instantiate BabelFy client.
    params = dict()
    params['lang'] = LANG
    babel_client = BabelfyClient(API_KEY, params)

    # Store parsed data.
    entities = list()
    all_entities = list()
    merged_entities = list()
    all_merged_entities = list()

    # Babelfy the text, sentence by sentence.
    for sentence in text_list:
        # A failed query is reported but does not abort the run.
        try:
            babel_client.babelfy(sentence)
        except Exception:
            traceback.print_exc()

        # Get entity data.
        if args.get('entities'):
            entities.append(babel_client.entities)

        # Get entity and non-entity data.
        if args.get('all_entities'):
            all_entities.append(babel_client.all_entities)

        # Get merged entities only.
        if args.get('merged_entities'):
            merged_entities.append(babel_client.merged_entities)

        # Get all merged entities.
        if args.get('all_merged_entities'):
            all_merged_entities.append(babel_client.all_merged_entities)

    # Export to file.
    if args.get('export'):
        from babelpy.dump import dump_json

        # Get the filename from cmd-line args.
        dumppath = args.get('export')

        # Ensure filename is unicode.
        if (sys.version < '3' and isinstance(dumppath, str)) or (
                sys.version > '3' and isinstance(dumppath, bytes)):
            dumppath = dumppath.decode('utf-8')

        # Append the .json extension when it is missing.
        dumppath = dumppath + '.json' if not dumppath.endswith('.json') \
            else dumppath

        output_data = dict()

        if args.get('entities'):
            output_data['entities'] = entities

        if args.get('all_entities'):
            output_data['all_entities'] = all_entities

        if args.get('merged_entities'):
            output_data['merged_entities'] = merged_entities

        if args.get('all_merged_entities'):
            output_data['all_merged_entities'] = all_merged_entities

        try:
            dump_json(output_data, dumppath)
        except Exception:
            print('failed to write file', file=sys.stderr)
            traceback.print_exc()

    # Print to stdout.
    if args.get('print'):

        if args.get('entities'):
            print('\nENTITIES')
            for token in entities:
                pprint(token)

        if args.get('all_entities'):
            print('\nALL ENTITIES')
            for token in all_entities:
                pprint(token)

        if args.get('merged_entities'):
            print('\nMERGED ENTITIES')
            for token in merged_entities:
                pprint(token)

        if args.get('all_merged_entities'):
            print('\nALL MERGED ENTITIES')
            for token in all_merged_entities:
                pprint(token)
コード例 #14
0
 def set_lang(self, lang: str):
     """Change the language and rebuild the BabelFy client accordingly."""
     # Update the parent's language first so get_lang() returns the new
     # value when the replacement client is constructed.
     super().set_lang(lang)
     self.__babel_client = BabelfyClient(self.__api_key,
                                         {'lang': self.get_lang()})
コード例 #15
0
ファイル: babelpy.py プロジェクト: fgrimme/babelpy
def main():
    """CLI entry point: annotate input text with BabelFy.

    Reads text from the command line or a file, babelfies it sentence by
    sentence, then optionally exports the collected annotations to a JSON
    file and/or pretty-prints them to stdout, depending on the parsed
    arguments.
    """
    global API_KEY

    # Parse the command-line arguments.
    args = parse()

    if not API_KEY:
        API_KEY = args.get('api_key')
        # Ensure the API key is unicode on both Python 2 and Python 3.
        if (sys.version < '3' and isinstance(API_KEY, str)) or (sys.version > '3' and isinstance(API_KEY, bytes)):
            API_KEY = API_KEY.decode('utf-8')
        elif not API_KEY:
            print('BabelFy API key is required.', file=sys.stderr)
            sys.exit()

    # Get the input text from cmd-line or file.
    if args.get('text'):
        text = [args.get('text')]
    elif args.get('text_file'):
        filepath = args.get('text_file')
        try:
            text = read_txt_file(filepath)
        except Exception:
            print('failed to read text', file=sys.stderr)
            sys.exit()
    else:
        print('need text data to babelfy. see --help option for usage.', file=sys.stderr)
        sys.exit()

    # Split the text into sentences, normalising newlines and encoding.
    text_list = list()
    for txt in text:
        sentence = txt.replace('\n', '').strip()
        if (sys.version < '3' and isinstance(sentence, str)) or (sys.version > '3' and isinstance(sentence, bytes)):
            sentence = sentence.decode('utf-8')
        text_list.append(sentence)

    try:
        # NOTE(review): this compares the *whole* last element of `text`
        # with '.', not its last character -- possibly meant to drop a
        # trailing empty sentence; confirm before changing.
        if text[-1] == '.':
            text_list = text_list[:-1]
    except IndexError:
        # Empty input; narrowed from a bare `except:` so unrelated errors
        # are no longer silently swallowed.
        pass

    # Instantiate BabelFy client.
    params = dict()
    params['lang'] = LANG
    babel_client = BabelfyClient(API_KEY, params)

    # Store parsed data.
    entities = list()
    all_entities = list()
    merged_entities = list()
    all_merged_entities = list()

    # Babelfy the text, sentence by sentence.
    for sentence in text_list:
        # A failed query is reported but does not abort the run.
        try:
            babel_client.babelfy(sentence)
        except Exception:
            traceback.print_exc()

        # Get entity data.
        if args.get('entities'):
            entities.append(babel_client.entities)

        # Get entity and non-entity data.
        if args.get('all_entities'):
            all_entities.append(babel_client.all_entities)

        # Get merged entities only.
        if args.get('merged_entities'):
            merged_entities.append(babel_client.merged_entities)

        # Get all merged entities.
        if args.get('all_merged_entities'):
            all_merged_entities.append(babel_client.all_merged_entities)

    # Export to file.
    if args.get('export'):
        from babelpy.dump import dump_json

        # Get the filename from cmd-line args.
        dumppath = args.get('export')

        # Ensure filename is unicode.
        if (sys.version < '3' and isinstance(dumppath, str)) or (sys.version > '3' and isinstance(dumppath, bytes)):
            dumppath = dumppath.decode('utf-8')

        # Append the .json extension when it is missing.
        dumppath = dumppath + '.json' if not dumppath.endswith('.json') \
            else dumppath

        output_data = dict()

        if args.get('entities'):
            output_data['entities'] = entities

        if args.get('all_entities'):
            output_data['all_entities'] = all_entities

        if args.get('merged_entities'):
            output_data['merged_entities'] = merged_entities

        if args.get('all_merged_entities'):
            output_data['all_merged_entities'] = all_merged_entities

        try:
            dump_json(output_data, dumppath)
        except Exception:
            print('failed to write file', file=sys.stderr)
            traceback.print_exc()

    # Print to stdout.
    if args.get('print'):

        if args.get('entities'):
            print('\nENTITIES')
            for token in entities:
                pprint(token)

        if args.get('all_entities'):
            print('\nALL ENTITIES')
            for token in all_entities:
                pprint(token)

        if args.get('merged_entities'):
            print('\nMERGED ENTITIES')
            for token in merged_entities:
                pprint(token)

        if args.get('all_merged_entities'):
            print('\nALL MERGED ENTITIES')
            for token in all_merged_entities:
                pprint(token)