def compute_mean(text, mean_list):
    ann_dict = {}
    annotations = tagme.annotate(text)
    # Keep annotations with a score higher than 0.3
    for ann in annotations.get_annotations(0.3):
        ann_dict[ann.entity_id] = ann.score
    final_ann = []
    if len(ann_dict) > 1:
        if len(ann_dict) > 10:
            # Keep only the 10 highest-scoring entities
            ordered = sorted(ann_dict.items(), key=lambda x: x[1], reverse=True)
            top_ann = ordered[:10]
            final_ann = [x[0] for x in top_ann]
        else:
            final_ann = list(ann_dict)
        # Pairwise relatedness between the selected entities
        media_temp = []
        for i in range(len(final_ann)):
            interest = final_ann[i]
            for j in range(i + 1, len(final_ann)):
                keyword = final_ann[j]
                if interest != keyword:
                    rels = tagme.relatedness_wid((interest, keyword))
                    media_temp.append(rels.relatedness[0].rel)
        media = numpy.mean(media_temp)
        mean_list.append(media)
        print(media)
    else:
        print("only one annotation found for the text")

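compute_mean above issues one relatedness_wid request per entity pair. The library also accepts a list of id pairs in a single call (the demo script near the end of this section does exactly that), so the same mean can be computed with one batched request. A minimal sketch under that assumption; mean_relatedness and the token placeholder are illustrative and not part of the original code:

import itertools

import numpy
import tagme

tagme.GCUBE_TOKEN = "<your gcube token>"  # placeholder; must be set before any TagMe call


def mean_relatedness(entity_ids):
    """Mean pairwise relatedness of a list of Wikipedia page ids, fetched in one batched request."""
    pairs = list(itertools.combinations(entity_ids, 2))
    if not pairs:
        return None
    resp = tagme.relatedness_wid(pairs)
    return numpy.mean([r.rel for r in resp.relatedness])
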
def process_text_append_text_annotations(input_text: str):
    # Find annotations in a text
    annotations = tagme.annotate(input_text, GCUBE_TOKEN)
    entities = " ".join(
        [word.entity_title for word in annotations.get_annotations(0.2)])
    # Convert characters to lower case
    input_text_to_lower = (input_text + " " + entities).lower()
    # Remove special characters from the string
    input_text_to_lower = re.sub('[^a-zA-Z0-9 \n]', '', input_text_to_lower)
    # Remove common words using list of stop words
    filtered_words_list = [
        word for word in input_text_to_lower.split()
        if word not in Ranking.stop_words
    ]
    # Stem the list of words
    filtered_words_list = [stem(word) for word in filtered_words_list]
    # Word ranking
    ranked_dict = dict()
    for word in filtered_words_list:
        if word in ranked_dict:
            ranked_dict[word] += 1
        else:
            ranked_dict[word] = 1
    return ranked_dict

def tag_text(text):
    # Retry until the TagMe service responds
    while True:
        try:
            annotations = tagme.annotate(text)
            break
        except Exception:
            print(text + " Again!")
    entities_list = set()
    # Collect entity titles with a score higher than 0.1
    if annotations:
        for ann in annotations.get_annotations(0.1):
            entity_title = str(ann.entity_title)
            entity_title = re.sub(r"\s+", " ", entity_title)
            entities_list.add(entity_title)
    entities_list = list(entities_list)
    return entities_list

def run(self):
    global queue
    while queue.qsize() > 0:
        # Killed
        if self._stop_event.is_set():
            break
        # Extract entities from sentences in queue
        sentence_meta = queue.get()
        try:
            if 'entities' not in sentence_meta:
                sentence_annotations = tagme.annotate(
                    sentence_meta['sentence'])
                entities = [{'pos_begin': ann.begin,
                             'pos_end': ann.end,
                             'entity_id': ann.entity_id,
                             'score': ann.score}
                            for ann in sentence_annotations.annotations]
                sentence_meta['entities'] = entities
            logger.info('{}, worker: {}, jobs remain: {}.'.format(
                datetime.now(), self._index, queue.qsize()))
        except Exception as e:
            logger.warning(e)
            # Send job back to queue
            queue.put(sentence_meta)
    logger.info('Worker {} exited.'.format(self._index))

def retrieveTopics(text, topics):
    try:
        annotations = tagme.annotate(text)
        # Keep annotations with a score higher than 0.3
        for ann in annotations.get_annotations(0.3):
            topics.setdefault(ann.entity_title, []).append(ann.score)
    except (Timeout, ConnectionError) as exc:
        print("connection error")

def string2ent(self, string):
    tokens = self.tokenizer(string)
    print(string)
    bpe_tokens = sum([self._bpe(t) for t in tokens], tuple())
    bpe_tokens = ' '.join(bpe_tokens).replace(BPEVocab.we, '')
    # Entity linking with TagMe
    ann = tagme.annotate(bpe_tokens)
    ents = []
    for a in ann.get_annotations(0.1):
        if a.entity_title not in ent_map:
            continue
        print(a)
        ents.append([ent_map[a.entity_title], a.begin, a.end, a.score])
    ent_str = self.split_str(bpe_tokens, ents)
    # Entity recognition with spaCy (its result is currently unused)
    doc = nlp(string)
    ents = []
    ann = [X.text for X in doc.ents]
    print(ann)
    for a in ann:
        if a not in ent_map:
            continue
        ents.append(ent_map[a])
    '''
    ent_str = self.spacy_str(bpe_tokens, ents)
    '''
    return ent_str

def get_annotation(query, th):
    lunch_annotations = tagme.annotate(query)
    res = []
    # Keep annotations with a score higher than the threshold th
    for ann in lunch_annotations.get_annotations(th):
        res.append(ann.entity_title)
    return ' '.join(res)

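Most snippets in this section assume tagme.GCUBE_TOKEN has already been set at module level. A minimal sketch of how get_annotation above could be called; the token placeholder and example text are illustrative:

import tagme

tagme.GCUBE_TOKEN = "<your gcube token>"  # placeholder, required before any TagMe call

# Returns the Wikipedia titles of entities scoring above 0.1, joined by spaces.
titles = get_annotation("Diego Maradona won the World Cup with Argentina", 0.1)
print(titles)
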
def getEntities(wordList):
    dictionary = {}
    annotationsText = " ".join(wordList)
    print(annotationsText)
    annotation = tagme.annotate(annotationsText)
    # Keep annotations with a score higher than 0.08
    for ann in annotation.get_annotations(0.08):
        dictionary[ann.entity_title] = ann.score
    return dictionary

def get_annotation_of_noCleanedFile(user_id, score, dirX):
    lunch_annotations = tagme.annotate(
        take_text_by_id_of_noCleanedFile(user_id, dirX))
    # Keep annotations with a score higher than the given threshold
    dictionary = {}
    for ann in lunch_annotations.get_annotations(score):
        dictionary[ann.entity_title] = ann.score
    return dictionary

def retrieve_titles_w_tag_me(self, question, tagme_api_key):
    import tagme
    tagme.GCUBE_TOKEN = tagme_api_key
    q_annotations = tagme.annotate(question)
    tagged_titles = []
    for ann in q_annotations.get_annotations(0.1):
        tagged_titles.append(ann.entity_title)
    return tagged_titles

def disambig(text, min_rho=None):
    annotations = tagme.annotate(text)
    a = dict()
    for x in annotations.annotations:
        if min_rho is None or x.score > min_rho:
            a[str(x.mention)] = x.entity_title
    return a

def get_links(self, text: str) -> List[Pair]:
    annotations = tagme.annotate(text)
    return [
        Pair(
            f'http://dbpedia.org/resource/{ann.entity_title.replace(" ", "_")}',
            ann.mention, 'entity')
        for ann in annotations.get_annotations(0.1)
    ]

def tagme_annotations(text):
    """
    :text: --> the text in English, after translation
    :return: --> the TagMe annotation response (an empty list if the text is empty)
    """
    # Named so it does not shadow the imported tagme module
    if text == "":
        return []
    tagme.GCUBE_TOKEN = VishvasnewsFactCheckingSiteExtractor.TAGME_API_KEY
    return tagme.annotate(text)

def QE_LM_Dirichlet(queries, paragraphs, en=False):
    # pseudo relevance matrix
    result = np.zeros((len(paragraphs), len(queries)), dtype=float)
    mu = 1500
    collection = [w for (pid, para) in paragraphs for w in para.split(" ")]
    count_w_C = {}
    sum_w_D = {}
    for q_index, query in enumerate(queries):
        print("query: %s with %s words.." % (q_index, len(query.split(" "))))
        cwD = np.zeros((len(paragraphs), len(query.split(" "))), dtype=float)
        for no, q in enumerate(query.split(" ")):
            count_w_D = {}
            for p_index, (pid, para) in enumerate(paragraphs):
                count_w_D[pid] = para.split(" ").count(q)
                cwD[p_index, no] = count_w_D.get(pid)
            if q in count_w_C.keys():
                continue
            count_w_C[q] = count_w_D
            sum_w_D[q] = sum(count_w_D.values())
        # Smoothing & i.i.d Language Model
        print(((cwD + mu * collection.count(q)) /
               (sum(count_w_D.values()) + mu)).shape)
        result[:, q_index] = np.prod(
            (cwD + mu * collection.count(q)) / (sum(count_w_D.values()) + mu),
            axis=1)
    if en == False:
        # Expand each query with the most frequent word of its top documents
        for q_index in range(result.shape[1]):
            print("expanding:%s" % q_index)
            pseudo_relevance = result[:, q_index]
            top_document_index = list(pseudo_relevance.argsort()[0:10])
            for index in top_document_index:
                print("expanding:%s; top document: %s" % (q_index, index))
                words = dict(
                    collections.Counter(paragraphs[index][1].split(" ")))
                words = sorted(words.items(), key=lambda item: item[1],
                               reverse=True)
                queries[q_index] = queries[q_index] + " " + words[0][0]
    else:
        # Expand each query with TagMe entity titles from its top documents
        for q_index in range(result.shape[1]):
            pseudo_relevance = result[:, q_index]
            top_document_index = list(pseudo_relevance.argsort())
            count = 0
            for index in top_document_index:
                if count >= 10:
                    break
                annotations = tagme.annotate(paragraphs[index][1])
                # annotations with a score higher than 0.1
                for ann in annotations.get_annotations(0.1):
                    queries[q_index] = queries[q_index] + " " + ann.entity_title
                count += 1
    return result, queries

def extract_entities(self, question, threshold=0.1):
    # Recurse over a list of questions
    if type(question) == list:
        return [self.extract_entities(_question, threshold)
                for _question in question]
    annotations = tagme.annotate(question)
    return [{
        'entity': ann.entity_title,
        'entity_id': ann.entity_id,
        'mention': ann.mention,
        'score': ann.score
    } for ann in sorted(annotations.get_annotations(threshold),
                        key=lambda x: -x.score)]

def entity_list(request):
    import tagme
    # Set the authorization token for subsequent calls.
    tagme.GCUBE_TOKEN = "a5a377c1-1bd0-47b9-907a-75b1cdacb1d9-843339462"
    db_post = PostTitle.objects.filter(user_id=request.user.id,
                                       search_date_interval=val2(),
                                       search_word=val(),
                                       date=date.today().strftime("%b-%d-%Y"))
    file1 = db_post.values_list('title', flat=True)
    entity_title_list = []
    # Annotate each stored post title
    for line in file1:
        lunch_annotations = tagme.annotate(line)
        # Keep annotations with a score higher than 0.35
        for ann in lunch_annotations.get_annotations(min_rho=0.35):
            entity_title_list.append(ann.entity_title)
    # Remove duplicates while preserving order
    entity_title_list = list(dict.fromkeys(entity_title_list))
    relation_degree_list = []
    relation_pair_list = []
    relation_list = [
        "ALL RELATIONS BETWEEN ENTITIES ARE LISTED BELOW", "\n", "\n"
    ]
    # Pairwise relatedness between the collected entity titles
    for i in range(0, len(entity_title_list)):
        for j in range(i + 1, len(entity_title_list)):
            rels = tagme.relatedness_title(
                (entity_title_list[i], entity_title_list[j]))
            relation_degree = rels.relatedness[0].rel
            if relation_degree != 0.0:
                relation_pair_list.append(
                    (entity_title_list[i], entity_title_list[j]))
                relation_degree_list.append(relation_degree)
                pairs = entity_title_list[i] + " " + entity_title_list[j]
                relation_list.append(pairs)
                relation_list.append("\t")
                relation_list.append(relation_degree)
                relation_list.append("\n")
        print(entity_title_list[i])
        print("Remaining entity count: ", len(entity_title_list) - (i + 1))
    print("done")
    return HttpResponse(relation_list, content_type="text/plain")

def tagme_result(request):
    import tagme
    # Set the authorization token for subsequent calls.
    tagme.GCUBE_TOKEN = "a5a377c1-1bd0-47b9-907a-75b1cdacb1d9-843339462"
    count = 0
    ann_list = ["Entity Results", "\n", "\n"]
    entity_title_list = []
    searched_word = val()
    selected_date = val2()
    try:
        db_post = PostTitle.objects.filter(
            user_id=request.user.id,
            search_date_interval=selected_date,
            search_word=searched_word,
            date=date.today().strftime("%b-%d-%Y"))
        file1 = db_post.values_list('title', flat=True)
        # Annotate each stored post title
        for line in file1:
            count += 1
            lunch_annotations = tagme.annotate(line)
            ann_list.append(count)
            ann_list.append("\n")
            # Keep annotations with a score higher than 0.35
            for ann in lunch_annotations.get_annotations(min_rho=0.35):
                ann_list.append(ann)
                ann_list.append("\t")
                ann_list.append(ann.uri())
                ann_list.append("\n")
                entity_title_list.append(ann.entity_title)
        return HttpResponse(ann_list, content_type="text/plain")
    except AttributeError:
        error_message = ("Something is wrong. Please contact the administrator "
                         "or send an email to '*****@*****.**'. Thank you!")
        return HttpResponse(error_message, content_type="text/plain")

def get_normal_enrichment(self, comment):
    base_url = "https://en.wikipedia.org/wiki/"
    tagme.GCUBE_TOKEN = self.tagme_token
    annotations = tagme.annotate(comment)
    # Append the Wikipedia article text of each confidently linked entity
    for annotation in annotations.get_annotations(0.4):
        response = requests.get(base_url + annotation.entity_title)
        soup = BeautifulSoup(response.text, 'html.parser')
        p = soup.find_all('p')
        wiki_text = ""
        for paragraph in p:
            wiki_text += paragraph.get_text() + " "
        comment += " " + wiki_text
    return comment

def mytagme_ann(data):
    annotations = tagme.annotate(data)
    dic = {}
    for ann in annotations.get_annotations(0.2):
        try:
            # Parse "mention -> entity_title (score: x)" out of the annotation's string form
            A, B, score = str(ann).split(" -> ")[0], str(ann).split(
                " -> ")[1].split(" (score: ")[0], str(ann).split(
                    " -> ")[1].split(" (score: ")[1].split(")")[0]
            dic[A] = {"link": B, "score": score}
        except Exception:
            print('error annotation about ' + str(ann))
    return dic

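mytagme_ann recovers the mention, title, and score by splitting the annotation's string form, which breaks if a title ever contains " -> " or " (score: ". The same fields are available as attributes (other snippets here read ann.mention, ann.entity_title, and ann.score directly), so a less fragile variant could look like this sketch; the function name is illustrative:

import tagme


def mytagme_ann_attrs(data):
    # Same output shape as mytagme_ann, but read from annotation attributes
    # instead of parsing str(ann).
    annotations = tagme.annotate(data)
    dic = {}
    for ann in annotations.get_annotations(0.2):
        dic[ann.mention] = {"link": ann.entity_title, "score": str(ann.score)}
    return dic
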
def return_tags():
    req_data = request.get_json(force=True)
    text = req_data['text']
    lunch_annotations = tagme.annotate(text)
    # Keep annotations scoring above the configured threshold
    entities = []
    for ann in lunch_annotations.get_annotations(score_higher_than):
        # Extract the entity title from the annotation's string form
        s = str(re.findall(r'->(.*?)score:', str(ann)))
        entity = re.sub('[^A-Za-z0-9]+', ' ', s).strip()
        entities.append(entity)
    # Remove duplicates while preserving order
    entities = list(dict.fromkeys(entities))
    entities_output = {"Entity-Linking-Entities": entities}
    return jsonify(entities_output)

def get_entities(sentence: str) -> list:
    # Extract entities from a sentence, return information about entities
    res = []
    for ann in tagme.annotate(sentence).annotations:
        entity = {
            'start_pos': ann.begin,
            'end_pos': ann.end,
            'score': ann.score,
            'title': ann.entity_title,
            'tagme_id': ann.entity_id,
            'wiki_id': get_wiki_id(ann.entity_title),
            'kg_id': get_kg_id(ann.entity_title)
        }
        res.append(entity)
    return res

def tagme_annotation(quest_path):
    print("annotating using tagme")
    tagme_entity_list = []
    with open(quest_path) as f:
        for i, line in enumerate(f):
            print(i)
            line = line.strip()
            ent_t_score = {}
            lunch_annotations = tagme.annotate(line)
            # Keep annotations with a score higher than 0.2
            for ann in lunch_annotations.get_annotations(0.2):
                ent_t_score[ann.entity_title] = ann.score
            tagme_entity_list.append(ent_t_score)
    return tagme_entity_list

def extract_keywords_from_tweet(text: str, filterStopwords: bool) -> set:
    extractor = extractors.ArticleExtractor()
    keywords = set()
    links = get_urls_from_text(text)
    # From disaster dataset
    text = clean_text(text)
    # Keep a set so entity titles can be added later; drop bare punctuation tokens
    keywords = {key for key in text.split(" ") if key not in string.punctuation}
    if filterStopwords:
        # Delete stopwords from text
        stop_words = stopwords.words('english')
        word_tokens = word_tokenize(text)
        filtered_sentence = set()
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.add(w)
        keywords = filtered_sentence
    # Enrich with entities found in the pages the tweet links to
    for url in links:
        try:
            external_content = extractor.get_content_from_url(url)
            if external_content != "":
                try:
                    annotations = tagme.annotate(external_content)
                    for ann in annotations.get_annotations(
                            annotation_score_treshold):
                        keywords.add(ann.entity_title)
                except Exception:
                    print("Error with tagme, skipping")
        except Exception:
            pass
    return keywords

def identify_entities(self, text):
    result = []
    annotations = tagme.annotate(text)
    for ann in annotations.get_annotations(0.1):
        name = ann.entity_title
        score = ann.score
        wiki_title = tagme.normalize_title(name)
        logger.info("Wiki title: " + wiki_title)
        mid = self.wiki_url[wiki_title]
        if mid is None:
            continue
        e = KBEntity(name, mid, score)
        ie = IdentifiedEntity(name, e, score)
        result.append(ie)
    return result

def __call__(self, src_path, filename):
    """
    src_path - path to web-page file
    filename - name of web-page file
    """
    with open(os.path.join(src_path, filename), 'r') as file:
        doc = ' '.join(line.strip() for line in file)
    cnt = Counter(
        ent.entity_title
        for ent in tagme.annotate(doc).get_annotations(self.rho_score))
    if cnt:
        with open(
                f'{os.path.join(self.tgt_path, os.path.splitext(filename)[0])}.json',
                'w') as tgt:
            json.dump(dict(cnt), tgt)

def add_europeana_node(data, db_conn, link_to_nodes=False, annotation_threshold=0.1):
    """
    Add Europeana nodes into the MEMEX-KG

    :param data: the dictionary of key->values to be saved in the KG
    :param db_conn: database connection
    :param link_to_nodes: a flag to link Europeana nodes to existing nodes in the MEMEX-KG (default False)
    :param annotation_threshold: TAG-ME mandatory threshold
    """
    wikidata_found = False
    for n in data:
        if link_to_nodes:
            # Add wikidata wids
            wiki_titles = []
            for idx, property_name in enumerate(n[0]):
                if property_name == "label" or property_name == "description" or property_name == "dcCreator":
                    value = n[1][idx]
                    europeana_annotations = tagme.annotate(value)
                    if europeana_annotations:
                        for ann in europeana_annotations.get_annotations(
                                annotation_threshold):
                            t = ann.entity_title
                            t = t[0].lower() + t[1:]
                            wiki_titles.append(t)
            wiki_titles = list(dict.fromkeys(wiki_titles))
            all_wids = []
            for title in wiki_titles:
                wids = db_conn.get_wikidata_ids_by_label(title)
                if wids:
                    all_wids.extend(wids)
            if all_wids:
                wikidata_found = True
                n[0].append("wids")
                n[1].append(all_wids)
        # Insert the node
        db_conn.queue_insert_node(n)  # , additional_class="Europeana")
    # Link to wikidata nodes
    if link_to_nodes and wikidata_found:
        db_conn.match_with_wikidata()

def main():
    # Annotate a text.
    print("Annotating text: ", SAMPLE_TEXT)
    # resp = tagme.annotate(SAMPLE_TEXT)
    resp = tagme.annotate(SAMPLE_TEXT, include_categories=True)
    print(resp)
    for ann in resp.annotations:
        print(ann)

    # Find mentions in a text.
    print("Finding mentions in text: ", SAMPLE_TEXT)
    resp = tagme.mentions(SAMPLE_TEXT)
    print(resp)
    for mention in resp.mentions:
        print(mention)

    # Find relatedness between one pair of entities, by title.
    resp = tagme.relatedness_title(["Barack_Obama", "Italy"])
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Find relatedness between pairs of entities, by title.
    resp = tagme.relatedness_title([("Barack_Obama", "Italy"),
                                    ("Italy", "Germany"),
                                    ("Italy", "BAD ENTITY NAME")])
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Access the relatedness response as a dictionary.
    resp_dict = dict(resp)
    print("Relatedness between Italy and Germany: ",
          resp_dict[("Italy", "Germany")])

    # Find relatedness between one pair of entities, by Wikipedia id.
    resp = tagme.relatedness_wid((31717, 534366))
    print(resp)
    for rel in resp.relatedness:
        print(rel)

    # Find relatedness between pairs of entities, by Wikipedia id.
    resp = tagme.relatedness_wid([(534366, 534366 + a) for a in range(1010)])
    print(resp)
    for rel in resp.relatedness:
        print(rel)

def tagme_annotate(in_file, out_file, threshold=0.1):
    with jsonlines.open(in_file) as f_in, jsonlines.open(out_file, "w") as f_out:
        for line in f_in:
            aliases = []
            spans = []
            qids = []
            probs = []
            text = line["sentence"]
            text_spans = text.split()
            text_span_indices = []
            total_len = 0
            # get word boundaries for converting char spans to word spans
            for i, t in enumerate(text_spans):
                text_span_indices.append(total_len)
                total_len += len(t) + 1
            lunch_annotations = tagme.annotate(text)
            # as the threshold increases, the precision increases, but the recall decreases
            for ann in lunch_annotations.get_annotations(threshold):
                mention = ann.mention
                try:
                    qid = enwiki_title_to_wikidata_id(ann.entity_title)
                except Exception:
                    print(f"No wikidata id found for {ann.entity_title}")
                    continue
                span_start = text_span_indices.index(ann.begin)
                try:
                    span_end = text_span_indices.index(ann.end + 1)
                except ValueError:
                    span_end = len(text_spans)
                aliases.append(mention)
                spans.append([span_start, span_end])
                qids.append(qid)
                probs.append(ann.score)
            line["aliases"] = aliases
            line["qids"] = qids
            line["spans"] = spans
            line["probs"] = probs
            line["gold"] = [True for _ in aliases]
            f_out.write(line)

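tagme_annotate reads one JSON object per line, requires only a "sentence" field, and writes the same object back with "aliases", "qids", "spans", "probs", and "gold" added. A minimal sketch of driving it, assuming jsonlines and enwiki_title_to_wikidata_id are importable as in the original module; the file names, token placeholder, and example sentence are illustrative:

import tagme

tagme.GCUBE_TOKEN = "<your gcube token>"  # placeholder

# Each input line is a JSON object such as {"sentence": "Heidelberg is a city in Germany"};
# the corresponding output line gains word-level spans, Wikidata QIDs, and TagMe scores.
tagme_annotate("sentences.jsonl", "sentences_tagged.jsonl", threshold=0.2)
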
async def getEntities(body: TAGME_MODEL):
    text = body.text
    text = re.sub(r'[^.,a-zA-Z0-9 \n.]', '', text)
    score = body.tagme_score
    tagme.GCUBE_TOKEN = str(body.tagme_token_api)
    lunch_annotations = tagme.annotate(text)
    # Keep annotations scoring above the requested threshold
    entities = []
    for ann in lunch_annotations.get_annotations(score):
        # Extract the entity title from the annotation's string form
        s = str(re.findall(r'->(.*?)score:', str(ann)))
        entity = re.sub('[^A-Za-z0-9]+', ' ', s).strip()
        entities.append(entity)
    entities = list(dict.fromkeys(entities))
    entities = list(set(entities))
    entities_output = {"Entities": entities}
    return entities_output

def run(self):
    tagme.GCUBE_TOKEN = TAGME_GCUBE_TOKEN
    with open('output/tagme/batches.pickle', 'rb') as f:
        batch_dict = pickle.load(f)
    batch_questions = batch_dict[self.question_batch]
    dict_annotations = {}
    for q in batch_questions:
        annotated_sentences = {}
        for s, text in q.text.items():
            # This ensures that preprocessing is matched for neural models
            text = ' '.join(tokenize_question(text))
            annotation = annotation_to_dict(tagme.annotate(text))
            annotated_sentences[s] = annotation
        dict_annotations[q.qnum] = annotated_sentences
    with open('output/tagme/tagged_batch_{}.pickle'.format(self.question_batch),
              'wb') as f:
        pickle.dump(dict_annotations, f)

# Fragment of a per-line evaluation loop
totalBotMicroPrecS += len(trueSet) * precS
totalBotMicroRecS += len(trueSet) * recS
# BOT macro scores
totalBotMacroPrecS += precS
totalBotMacroRecS += recS
totalMyMentionsS += len(resultS)
if verbose:
    print('Split: ' + str(precS) + ', ' + str(recS))

# get results for manually split string
if doManual:
    # tagme has separate way to do things
    if mthd == 'tagme':
        antns = tagme.annotate(" ".join(line['text']))
        resultM = []
        for an in antns.get_annotations(0.005):
            resultM.append([an.begin, an.end, title2id(an.entity_title)])
    else:
        # unsplit string to be manually split and mentions found
        try:
            resultM = wikifyEval(" ".join(line['text']), False,
                                 hybridC=doHybrid, maxC=maxCands,
                                 method=mthd, model=mlModel,
                                 erMethod=erMethod)
        except Exception:
            skipped += 1
            badThing.append(line)
            continue
    precM = precision(trueEntities, resultM)  # precision of manual split
    recM = recall(trueEntities, resultM)  # recall of manual split

logging.info("Processing {}".format(xml_file)) for i, doc in enumerate(get_documents(xml_file)): if doc is None: logging.warning("Could not parse document {} from {}".format(i, xml_file)) continue key, title, body, time = doc doc_path = "{}.csv".format(os.path.join(docs_path_base, key)) entities_path = "{}.csv".format(os.path.join(entities_path_base, key)) if (os.path.isfile(doc_path) and os.path.isfile(entities_path)): logging.info("Document {} already annotated, skipping.".format(key)) continue logging.info("Annotating document key={} length={} ({})".format(key, len(body), xml_file)) tagme_response = tagme.annotate(u'{} {}'.format(title, body), args.gcube_token, lang=args.lang) if not tagme_response: logging.warning("Could not annoate document {} from {} (key {})".format(i, xml_file, key)) continue annotations = tagme_response.get_annotations(min_rho=0.2) logging.info("Found {} annotations".format(len(annotations))) with open(doc_path, 'wb') as csv_doc_out: w = csv.DictWriter(csv_doc_out, encoding='utf-8', fieldnames=DOCS_CSV_FIELDS) w.writerow({'key': key, 'title': title, 'body': body, 'time': time}) with open(entities_path, 'wb') as csv_entities_out: w = csv.DictWriter(csv_entities_out, encoding='utf-8', fieldnames=ENTITIES_CSV_FIELDS) for annotation in annotations: w.writerow({'key': key, 'entity': annotation.entity_title, 'score': annotation.score, 'time': time})