from babelpy.babelfy import BabelfyClient


def babelfy(text, key):
    params = {"lang": "en"}
    babel_client = BabelfyClient(key, params)
    babel_client.babelfy(text)
    synsets = []
    entities = []
    for entity in babel_client.merged_entities:
        token_start = entity["tokenFragment"]["start"]
        token_end = entity["tokenFragment"]["end"]
        synset = entity["babelSynsetID"]
        lemma = entity["text"]
        synsets.append({
            'token_start': token_start,
            'token_end': token_end,
            'synset': synset,
            'lemma': lemma
        })
        entities.append({
            'token_start': token_start,
            'token_end': token_end,
            'entity': synset,
            'lemma': lemma
        })
    return {'synsets': synsets, 'entities': entities}
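A minimal usage sketch for the helper above, assuming a valid Babelfy API key; "YOUR_API_KEY" and the sample sentence are placeholders, and the printed fields mirror the dicts the helper builds.

if __name__ == "__main__":
    # Placeholder key; register on the Babelfy website to obtain a real one.
    result = babelfy("BabelNet is a multilingual semantic network.", "YOUR_API_KEY")
    for record in result['synsets']:
        # Each record carries token offsets, the BabelNet synset id and the lemma.
        print(record['token_start'], record['token_end'],
              record['synset'], record['lemma'])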
def __init__(self, field_to_link: str, api_key: str = None, lang: str = "EN"):
    super().__init__("all_retrieved")
    self.__field_to_link = field_to_link
    self.__api_key = api_key
    self.__babel_client = BabelfyClient(self.__api_key, {"lang": lang})
def findentities(lines, lang, args, cache=None):
    """Find entities using BabelFy given a set of input lines."""
    babelfy_params = dict()
    babelfy_params['lang'] = lang.upper()
    if args.cands is not None:
        babelfy_params['cands'] = args.cands
    if args.anntype is not None:
        babelfy_params['annType'] = args.anntype
    if args.annres is not None:
        babelfy_params['annRes'] = args.annres
    if args.th is not None:
        babelfy_params['th'] = args.th
    if args.match is not None:
        babelfy_params['match'] = args.match
    if args.mcs is not None:
        babelfy_params['MCS'] = args.mcs
    if args.dens:
        babelfy_params['dens'] = "true"
    if args.extaida:
        babelfy_params['extAida'] = "true"
    if args.postag is not None:
        babelfy_params['posTag'] = args.postag
    babelclient = BabelfyClient(args.apikey, babelfy_params)
    for i, (text, firstlinenr, lastlinenr, offsetmap) in enumerate(
            gettextchunks(lines, maxchunksize=4096)):
        if args.dryrun:
            print("---\nCHUNK #" + str(i) + ". Would run query for firstlinenr=" +
                  str(firstlinenr) + ", lastlinenr=" + str(lastlinenr),
                  " text=" + text, file=sys.stderr)
            print("Offsetmap:", repr(offsetmap), file=sys.stderr)
        elif cache is not None and text in cache:
            entities = cache[text]
            print("chunk #" + str(i) + " -- retrieved from cache", file=sys.stderr)
        else:
            print("chunk #" + str(i) + " -- querying BabelFy", file=sys.stderr)
            babelclient.babelfy(text)
            entities = babelclient.entities
            if cache is not None:
                cache[text] = entities  # put in cache
        if not args.dryrun:
            for j, entity in enumerate(resolveoverlap(entities, args.overlap)):
                try:
                    entity['linenr'], entity['offset'] = resolveoffset(
                        offsetmap, entity['start'], lines, entity)
                    if 'ignore' not in entity or not entity['ignore']:
                        yield entity
                except ValueError as e:
                    print("---\nCHUNK #" + str(i) + " ENTITY #" + str(j) +
                          ". Ran query for firstlinenr=" + str(firstlinenr) +
                          ", lastlinenr=" + str(lastlinenr), " text=" + text,
                          file=sys.stderr)
                    print("Entity:", repr(entity), file=sys.stderr)
                    print("Offsetmap:", repr(offsetmap), file=sys.stderr)
                    raise e
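A hypothetical driver for the generator above: argparse.Namespace stands in for the parsed command-line arguments the function expects, and the overlap value is an assumption since resolveoverlap is not shown here.

import argparse

# All attribute names follow the args.* accesses inside findentities();
# the values (including overlap="longest") are assumptions.
args = argparse.Namespace(cands=None, anntype=None, annres=None, th=None,
                          match=None, mcs=None, dens=False, extaida=False,
                          postag=None, apikey="YOUR_API_KEY",
                          overlap="longest", dryrun=False)
cache = {}
with open("input.txt", encoding="utf-8") as f:
    lines = f.readlines()
for entity in findentities(lines, "en", args, cache=cache):
    print(entity['linenr'], entity['offset'], entity.get('babelSynsetID'))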
class BabelPyEntityLinking(EntityLinking):
    """
    Interface to the BabelPy library, which wraps some features of Babelfy
    entity linking.

    Args:
        api_key: string obtained by registering on the Babelfy website; with no
            api_key only a few queries can be executed
    """

    def __init__(self, api_key: str = None):
        super().__init__()
        self.__api_key = api_key
        self.__babel_client = None

    @FieldContentProductionTechnique.lang.setter
    def lang(self, lang: str):
        FieldContentProductionTechnique.lang.fset(self, lang)
        params = dict()
        params['lang'] = self.lang
        self.__babel_client = BabelfyClient(self.__api_key, params)

    def __str__(self):
        return "BabelPyEntityLinking"

    def produce_content(self, field_representation_name: str, field_data) -> FeaturesBagField:
        """
        Produces the field content for this representation: a bag of features
        whose keys are BabelNet synset ids and whose values are the global
        scores of the synsets.

        Args:
            field_representation_name (str): name of the field representation
            field_data: text that will be linked to BabelNet

        Returns:
            feature_bag (FeaturesBagField)
        """
        field_data = check_not_tokenized(field_data)
        self.__babel_client.babelfy(field_data)
        feature_bag = FeaturesBagField(field_representation_name)
        try:
            # entities is None when Babelfy returns no annotations.
            if self.__babel_client.entities is not None:
                for entity in self.__babel_client.entities:
                    feature_bag.append_feature(entity['babelSynsetID'],
                                               entity['globalScore'])
        except AttributeError:
            pass
        return feature_bag
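A hypothetical call to the class above; the lang property must be assigned before produce_content(), since it is the setter that instantiates the Babelfy client. The representation name and input text are made up.

# Sketch only: "babelfy_repr" and the input text are assumptions.
linker = BabelPyEntityLinking(api_key="YOUR_API_KEY")
linker.lang = "EN"   # triggers the setter that builds the BabelfyClient
bag = linker.produce_content("babelfy_repr",
                             "Leonardo DiCaprio starred in Inception")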
class BabelPyEntityLinking(EntityLinking):
    """
    Interface to the BabelPy library, which wraps some features of Babelfy
    entity linking.

    Args:
        api_key: string obtained by registering on the Babelfy website; with no
            api_key only a few queries can be executed
    """

    def __init__(self, api_key: str = None):
        super().__init__()
        self.__api_key = api_key
        self.__babel_client = BabelfyClient(self.__api_key, {"lang": self.lang})

    def produce_single_repr(self, field_data: Union[List[str], str]) -> FeaturesBagField:
        """
        Produces a bag of features whose keys are BabelNet synset ids and whose
        values are the global scores of the synsets.
        """
        field_data = check_not_tokenized(field_data)
        self.__babel_client.babelfy(field_data)
        feature_bag = {}
        try:
            # entities is None when Babelfy returns no annotations.
            if self.__babel_client.entities is not None:
                for entity in self.__babel_client.entities:
                    feature_bag[entity['babelSynsetID']] = entity['globalScore']
        except AttributeError:
            pass
        return FeaturesBagField(feature_bag)

    def __str__(self):
        return "BabelPyEntityLinking"

    def __repr__(self):
        return "< BabelPyEntityLinking: babel client = " + str(self.__babel_client) + " >"
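A comparable sketch for this second variant, which builds the client in __init__; how self.lang is initialized depends on the EntityLinking superclass, so the bare constructor call is an assumption.

# Sketch only; the returned FeaturesBagField wraps the
# {babelSynsetID: globalScore} dict built in produce_single_repr().
linker = BabelPyEntityLinking(api_key="YOUR_API_KEY")
bag = linker.produce_single_repr("Christopher Nolan directed Inception")
print(bag)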
def simplify_ques_fy(question):
    params = {'lang': 'IT', 'th': '.0', 'match': 'PARTIAL_MATCHING'}
    babel_client = BabelfyClient(os.environ['BABEL'], params)
    # Babelfy sentence.
    babel_client.babelfy(question)
    data = babelfyAPI(params)
    question = re.sub(no_end_char, '', question)
    splitted_question = question.split()
    simplfy_ques = []
    for result in data:
        for token in splitted_question[result['tokenFragment']['start']:
                                       result['tokenFragment']['end'] + 1]:
            if token not in simplfy_ques and token not in remove_words:
                simplfy_ques.append(token)
    #squestion = " ".join(simplfy_ques)
    #params['text'] = squestion
    params['match'] = 'PARTIAL_MATCHING'
    data = babelfyAPI(params)
    qwords = question.lower().split()
    # Check whether the question is a negative one.
    neg = False
    for w in qwords:
        if w in negative_words:
            neg = True
            break
    for w in negative_words:
        if w in splitted_question:
            splitted_question.remove(w)
    rank_dict = {}
    simply_rank_dict = {}
    simply_rank_list = []
    check_synset = []
    senses = []
    for i, _ in enumerate(splitted_question):
        # If two subsequent words have their first letter uppercased, they
        # probably refer to a proper noun.
        try:
            if splitted_question[i][0].isupper() and splitted_question[i + 1][0].isupper():
                senses.append(splitted_question[i] + " " + splitted_question[i + 1])
        except IndexError:
            pass
    for result in data:
        result['score'] += result['coherenceScore']
        result['score'] += result['globalScore']
        for i in range(result['tokenFragment']['start'],
                       result['tokenFragment']['end'] + 1):
            rank_dict.setdefault(i, []).append(result)
    for i in rank_dict:
        best = None
        for j, result in enumerate(rank_dict[i]):
            if best is None:
                best = result
            elif result['score'] > best['score']:
                best = result
        if best['babelSynsetID'] not in simply_rank_dict:
            simply_rank_dict[best['babelSynsetID']] = best
            check_synset.append(best['babelSynsetID'])
            els = " ".join(splitted_question[best['tokenFragment']['start']:
                                             best['tokenFragment']['end'] + 1])
            if els not in senses and els not in remove_words:
                senses.append(els)
    for bid in check_synset:
        if bid[-1] == 'n':
            params = {
                'id': bid,
                'targetLang': 'IT',
                'key': os.environ['BABEL']
            }
            ris = babelAPI(params)
            if ris not in senses and ris not in remove_words:
                senses.append(ris)
    return ParsedQuestion(question, senses, simplfy_ques), neg
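A hypothetical invocation of simplify_ques_fy(); it requires the BABEL key in the environment plus the module-level word lists (negative_words, remove_words) defined alongside it.

# Sketch only: the question is the Italian example used elsewhere in this code.
parsed, is_negative = simplify_ques_fy(
    "Chi tra i seguenti non e un nano di 'Biancaneve'")
print(parsed)        # simplified tokens and proper nouns (see ParsedQuestion)
print(is_negative)   # True when the question contains a negative word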
import io
import json
import os

from babelpy.babelfy import BabelfyClient

# Negative words.
negative_words = json.loads(
    io.open("Data/settings.json", encoding="utf-8").read())["negative_words"]

# Punctuation stripped from questions.
no_end_char = r'[!@#$?:,;]'

# Wikipedia URL pattern.
wiki_url = r'(https:\/\/|http:\/\/)([a-z]*.wikipedia.org)'

# Sentinel used when no score is available.
NO_SCORE = 969696

params = {'lang': 'IT', 'th': '.0', 'match': 'PARTIAL_MATCHING'}
babel_client = BabelfyClient(os.environ['BABEL'], params)


class ParsedQuestion:
    """Holds some elements extracted from the question."""

    def __init__(self, originalq, proper_nouns, simplyfiedq):
        self.original = originalq
        self.proper_nouns = proper_nouns
        self.simplyfied = simplyfiedq

    def __str__(self):
        return "simplyfied:{}\nproper_nouns:{}".format(self.simplyfied,
                                                       self.proper_nouns)


class Searcher(object):
def __init__(self, api_key: str = None):
    super().__init__()
    self.__api_key = api_key
    self.__babel_client = BabelfyClient(self.__api_key, {"lang": self.lang})
from __future__ import print_function, unicode_literals, division, absolute_import

import sys
import unittest
import json
import os

from babelpy.babelfy import BabelfyClient
from babelpy.config.config import API_KEY, LANG
from babelpy.reader import read_txt_file

params = dict()
params['lang'] = LANG
bc = BabelfyClient(API_KEY, params)

txt = "BabelNet is both a multilingual encyclopedic dictionary and a semantic network."

entities = [{
    "tokenFragment": {
        "start": 0,
        "end": 0
    },
    "charFragment": {
        "start": 0,
        "end": 7
    },
    "babelSynsetID": "bn:03083790n",
    "DBpediaURL": "http://dbpedia.org/resource/BabelNet",
    "BabelNetURL": "http://babelnet.org/rdf/s03083790n",
    "score": 1.0,
    "coherenceScore": 0.6666666666666666,
    "globalScore": 0.11428571428571428,
    "source": "BABELFY"
}, {
def lang(self, lang: str):
    FieldContentProductionTechnique.lang.fset(self, lang)
    params = dict()
    params['lang'] = self.lang
    self.__babel_client = BabelfyClient(self.__api_key, params)
class BabelPyEntityLinking(EntityLinking):
    """
    Interface to the BabelPy library, which wraps some features of Babelfy
    entity linking.

    Args:
        api_key: string obtained by registering on the Babelfy website; with no
            api_key only a few queries can be executed
    """

    def __init__(self, field_to_link: str, api_key: str = None, lang: str = "EN"):
        super().__init__("all_retrieved")
        self.__field_to_link = field_to_link
        self.__api_key = api_key
        self.__babel_client = BabelfyClient(self.__api_key, {"lang": lang})

    def get_properties(self, raw_source: RawInformationSource) -> List[EntitiesProp]:
        """
        Produces a list of EntitiesProp objects, one for every raw content in
        the raw source. An EntitiesProp object is basically a dict whose keys
        are the linked entities (since there can be multiple entities in a
        field) and whose values are the properties retrieved from BabelPy for
        that entity.

        EXAMPLE:
            properties_list = [EntitiesProp(), EntitiesProp(), ...]
            EntitiesProp.value -> {'DiCaprio': {'babelSynsetID': ..., ...},
                                   'Nolan': {'babelSynsetID': ..., ...}, ...}
        """
        properties_list = []
        logger.info("Doing Entity Linking with BabelFy")
        for raw_content in progbar(raw_source, max_value=len(list(raw_source))):
            data_to_disambiguate = check_not_tokenized(
                raw_content[self.__field_to_link])
            self.__babel_client.babelfy(data_to_disambiguate)
            properties_content = {}
            try:
                if self.__babel_client.merged_entities is not None:
                    for entity in self.__babel_client.merged_entities:
                        properties_entity = {
                            'babelSynsetID': '',
                            'DBPediaURL': '',
                            'BabelNetURL': '',
                            'score': '',
                            'coherenceScore': '',
                            'globalScore': '',
                            'source': ''
                        }
                        for key in properties_entity:
                            if entity.get(key) is not None:
                                properties_entity[key] = entity[key]
                        properties_content[entity['text']] = properties_entity
                properties_list.append(EntitiesProp(properties_content))
            except AttributeError:
                raise AttributeError(
                    "BabelFy limit reached! Insert an api key or change it if you inserted one!"
                )
        return properties_list

    def __str__(self):
        return "BabelPyEntityLinking"

    def __repr__(self):
        return "< BabelPyEntityLinking: babel client = " + str(
            self.__babel_client) + " >"
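A hypothetical usage of get_properties(); JSONFile is a stand-in for whatever RawInformationSource implementation the surrounding framework provides, and the field name is made up.

# Sketch only: JSONFile, the file path and the "plot" field are assumptions.
linker = BabelPyEntityLinking(field_to_link="plot",
                              api_key="YOUR_API_KEY", lang="EN")
for entities_prop in linker.get_properties(JSONFile("movies_info.json")):
    print(entities_prop)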
#from modules.pybabelfy.pybabelfy.babelfy import *
import os

from babelpy.babelfy import BabelfyClient

# Instantiate BabelFy client.
params = dict()
params['lang'] = 'IT'
params['th'] = '.0'
params['match'] = 'PARTIAL_MATCHING'
babel_client = BabelfyClient(os.environ['BABEL'], params)

# Babelfy sentence.
babel_client.babelfy("Chi tra i seguenti non e un nano di 'Biancaneve'")

# Get entity data.
print(babel_client.entities)
print('\n')

# Get entity and non-entity data.
print(babel_client.all_entities)
print('\n')

# Get merged entities only.
print(babel_client.merged_entities)
print('\n')

# Get all merged entities.
babel_client.all_merged_entities

"""
def frag(semantic_annotation, input_text):
    start = semantic_annotation.char_fragment_start()
    end = semantic_annotation.char_fragment_end()
    return input_text[start:end+1]
def main():
    global API_KEY
    # Parse the command-line arguments.
    args = parse()
    if not API_KEY:
        API_KEY = args.get('api_key')
        # Ensure the API key is unicode.
        if (sys.version < '3' and isinstance(API_KEY, str)) or (
                sys.version > '3' and isinstance(API_KEY, bytes)):
            API_KEY = API_KEY.decode('utf-8')
        elif not API_KEY:
            print('BabelFy API key is required.', file=sys.stderr)
            sys.exit()

    # Get the input text from cmd-line or file.
    if args.get('text'):
        text = [args.get('text')]
    elif args.get('text_file'):
        filepath = args.get('text_file')
        try:
            text = read_txt_file(filepath)
        except Exception as e:
            print('failed to read text', file=sys.stderr)
            sys.exit()
    else:
        print('need text data to babelfy. see --help option for usage.',
              file=sys.stderr)
        sys.exit()

    # Split the text into sentences.
    text_list = list()
    for txt in text:
        sentence = txt.replace('\n', '').strip()
        if (sys.version < '3' and isinstance(sentence, str)) or (
                sys.version > '3' and isinstance(sentence, bytes)):
            sentence = sentence.decode('utf-8')
        text_list.append(sentence)
    try:
        if text[-1] == '.':
            text_list = text_list[:-1]
    except:
        pass

    # Instantiate BabelFy client.
    params = dict()
    params['lang'] = LANG
    babel_client = BabelfyClient(API_KEY, params)

    # Store parsed data.
    entities = list()
    all_entities = list()
    merged_entities = list()
    all_merged_entities = list()

    # Babelfy the text, sentence by sentence.
    for sentence in text_list:
        # Babelfy sentence.
        try:
            babel_client.babelfy(sentence)
        except Exception as e:
            traceback.print_exc()
        # Get entity data.
        if args.get('entities'):
            entities.append(babel_client.entities)
        # Get entity and non-entity data.
        if args.get('all_entities'):
            all_entities.append(babel_client.all_entities)
        # Get merged entities only.
        if args.get('merged_entities'):
            merged_entities.append(babel_client.merged_entities)
        # Get all merged entities.
        if args.get('all_merged_entities'):
            all_merged_entities.append(babel_client.all_merged_entities)

    # Export to file.
    if args.get('export'):
        from babelpy.dump import dump_json
        # Get the filename from cmd-line args.
        dumppath = args.get('export')
        # Ensure filename is unicode.
        if (sys.version < '3' and isinstance(dumppath, str)) or (
                sys.version > '3' and isinstance(dumppath, bytes)):
            dumppath = dumppath.decode('utf-8')
        dumppath = dumppath + '.json' if not dumppath.endswith('.json') \
            else dumppath
        output_data = dict()
        if args.get('entities'):
            output_data['entities'] = entities
        if args.get('all_entities'):
            output_data['all_entities'] = all_entities
        if args.get('merged_entities'):
            output_data['merged_entities'] = merged_entities
        if args.get('all_merged_entities'):
            output_data['all_merged_entities'] = all_merged_entities
        try:
            dump_json(output_data, dumppath)
        except Exception as e:
            print('failed to write file', file=sys.stderr)
            traceback.print_exc()

    # Print to stdout.
    if args.get('print'):
        if args.get('entities'):
            print('\nENTITIES')
            for token in entities:
                pprint(token)
        if args.get('all_entities'):
            print('\nALL ENTITIES')
            for token in all_entities:
                pprint(token)
        if args.get('merged_entities'):
            print('\nMERGED ENTITIES')
            for token in merged_entities:
                pprint(token)
        if args.get('all_merged_entities'):
            print('\nALL MERGED ENTITIES')
            for token in all_merged_entities:
                pprint(token)
def set_lang(self, lang: str):
    super().set_lang(lang)
    params = dict()
    params['lang'] = self.get_lang()
    self.__babel_client = BabelfyClient(self.__api_key, params)