def annotate(self):
    # We get something like "?text=<text>&views=<views>". The two lines below extract these.
    text = request.args.get('text')
    views = request.args.get('views')
    if text is None or views is None:
        return "The parameters 'text' and/or 'views' are not specified. Here is a sample input: " \
               "?text=\"This is a sample sentence. I'm happy.\"&views=POS,NER"
    views = views.split(",")
    if self.provided_view not in views:
        logging.info("desired view not provided by this server.")
        # After discussing with Daniel, this is the proper discipline for handling views not provided by this server.
        # The appelles server will fall back to the next remote server.
        return "VIEW NOT PROVIDED"
    # create a text annotation with the views required by the model
    required_views = ",".join(self.get_required_views())
    ta_json = self.get_text_annotation_for_model(text=text, required_views=required_views)
    docta = TextAnnotation(json_str=ta_json)
    # send it to the model for inference
    docta = self.model.inference_on_ta(docta=docta, new_view_name=self.provided_view)
    # serialize the returned text annotation back to JSON
    ta_json = json.dumps(docta.as_json)
    # print("returning", ta_json)
    return ta_json
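# A minimal client-side sketch of how the annotate() endpoint above could be called.
# The host, port, and "/annotate" route are assumptions for illustration; the "text"
# and "views" query parameters follow the sample input documented in annotate().
import requests

resp = requests.get("http://localhost:5000/annotate",  # hypothetical server address
                    params={"text": "This is a sample sentence. I'm happy.",
                            "views": "POS,NER"})
if resp.text == "VIEW NOT PROVIDED":
    # the server does not serve the requested view; a caller would fall back to another server
    print("fall back to the next remote server")
else:
    ta_json = resp.text  # JSON-serialized TextAnnotation produced by annotate()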
def get_text_annotation_for_model(self, text: str, required_views: List[str]):
    # TODO This is a problem with ccg_nlpy text annotation: it does not like newlines (e.g., marking paragraphs)
    text = text.replace("\n", "")
    required_views = ",".join(required_views)
    ta_json = self.pipeline.call_server(text=text, views=required_views)
    ta = TextAnnotation(json_str=ta_json)
    return ta
def load_sentences(path, lower, zeros, ratio=1.0):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    file_list = os.listdir(path)
    sentences = []
    label_list = set()
    for doc in file_list:
        print("Reading " + os.path.join(path, doc))
        document = TextAnnotation(json_str=open(os.path.join(path, doc)).read())
        ner_labels = document.view_dictionary['NER_CONLL'].cons_list
        if ner_labels is None:
            ner_labels = []
        ner_dict = {}
        for ner_constituent in ner_labels:
            for span in range(ner_constituent['start'], ner_constituent['end']):
                if span - ner_constituent['start'] == 0:
                    ner_dict[span] = "B-" + ner_constituent['label']
                else:
                    ner_dict[span] = "I-" + ner_constituent['label']
                if ner_dict[span] not in label_list:
                    label_list.add(ner_dict[span])
                    print(ner_dict[span])
        try:
            sentences_cons_list = document.view_dictionary['SENTENCE'].cons_list
        except KeyError:
            # no SENTENCE view: rebuild sentence spans from the sentence end positions
            sentences_cons_list = []
            start = 0
            for end in document.sentence_end_position:
                sent = " ".join(document.tokens[start:end])
                sentences_cons_list.append({'tokens': sent, 'start': start, 'end': end})
                start = end
        for sent_constituent in sentences_cons_list:
            sentence = []
            sent = re.split(r"\s+", sent_constituent['tokens'])
            start = sent_constituent['start']
            end = sent_constituent['end']
            for token, span in zip(sent, range(start, end)):
                if span in ner_dict:
                    sentence.append([token, ner_dict[span]])
                else:
                    sentence.append([token, "O"])
            if len(sentence) > 1:
                sentences.append(sentence)
    random.shuffle(sentences)
    train_sentences = sentences[:int(ratio * len(sentences))]
    dev_sentences = sentences[int(ratio * len(sentences)):]
    return train_sentences, dev_sentences
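# Illustrative sketch (not from the source data) of the per-sentence structure that
# load_sentences() builds: each sentence is a list of [token, BIO-label] pairs, where
# the first token of an NER_CONLL constituent gets a "B-" prefix, the remaining tokens
# get "I-", and all other tokens get "O". The tokens and labels below are made up.
example_sentence = [
    ["John", "B-PER"],
    ["Smith", "I-PER"],
    ["visited", "O"],
    ["Chicago", "B-LOC"],
    [".", "O"],
]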
def get_text_annotation_for_model(self, text, required_views):
    # TODO This is a problem with ccg_nlpy text annotation: it does not like newlines (e.g., marking paragraphs)
    text = text.replace("\n", "")
    pretokenized_text = [text.split(" ")]
    required_views = ",".join(required_views)
    logging.info(f"required_views:{required_views}")
    ta_json = self.pipeline.call_server_pretokenized(pretokenized_text=pretokenized_text,
                                                     views=required_views)
    ta = TextAnnotation(json_str=ta_json)
    return ta
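# Small sketch, mirroring the call above, of the shape passed as pretokenized_text:
# a list of sentences, each sentence a list of token strings (here a single sentence,
# since the method splits the whole text on spaces). The tokens are illustrative only.
example_pretokenized_text = [["This", "is", "a", "sample", "sentence", "."]]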
def process_file(file):
    infile = os.path.join(args.indir, file)
    outfile = os.path.join(args.outdir, file + '.json')
    logging.info(f'Processing {infile} and output to {outfile}')
    try:
        docta = TextAnnotation(json_str=open(infile, encoding='utf8', errors='ignore').read())
    except Exception:
        # skip files that cannot be parsed into a TextAnnotation
        return
    docid = infile.split("/")[-1]
    logging.info("processing docid %s", docid)
    cg.compute_hits_for_ta(docta=docta, outfile=outfile, args=args)
def compute_cand_stats(tafiles, args):
    mention2cand = dict()
    for i, tafile in tqdm(enumerate(tafiles)):
        docta = TextAnnotation(json_str=open(os.path.join(args.indir, tafile)).read())
        docid = tafile.split("/")[-1]
        # map each token string to the list of its character offsets in the document
        tokens2offset = defaultdict(list)
        for token, offset in zip(docta.get_tokens, docta.char_offsets):
            tokens2offset[token].append(offset)
        candgen_view = docta.get_view("CANDGEN")
        for cons in candgen_view.cons_list:
            offset = tokens2offset[cons["tokens"]].pop(0)
            mention_name = f"{docid[:-5]}:{offset[0]}-{offset[1]}"
            if mention_name in mention2cand:
                logging.info("found duplicate mentions")
            if 'labelScoreMap' in cons:
                labelScoreMap = cons['labelScoreMap']
                mention2cand[mention_name] = labelScoreMap
            else:
                mention2cand[mention_name] = ["NIL"]
    return mention2cand
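# Hedged sketch of the mapping returned by compute_cand_stats(): keys follow the
# "<docid-without-extension>:<char_start>-<char_end>" pattern built above, and values
# are either the constituent's labelScoreMap or ["NIL"]. The docid, candidate ids,
# and scores below are made up for illustration.
example_mention2cand = {
    "doc_0001:17-28": {"cand_entity_a": 0.83, "cand_entity_b": 0.11},  # labelScoreMap case
    "doc_0001:45-51": ["NIL"],  # constituent without a labelScoreMap
}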
def new_test_file(self, test_mens_file):
    self.test_mens_file = test_mens_file
    with open(test_mens_file, 'r') as f:
        tajsonstr = f.read()
    ta = TextAnnotation(json_str=tajsonstr)
    self.textanno = ta
    (sentences_tokenized, modified_ner_cons_list) = self.processTestDoc(ta)
    self.mention_lines = self.convertSent2NerToMentionLines(sentences_tokenized,
                                                            modified_ner_cons_list)
    self.mentions = []
    for line in self.mention_lines:
        m = Mention(line)
        self.mentions.append(m)
    self.men_idx = 0
    self.num_mens = len(self.mentions)
    self.epochs = 0
# Load the model
_, f_eval = model.build(training=False, **parameters)
model.reload()

# f_output = codecs.open(opts.output, 'w', 'utf-8')
start = time.time()
print('Tagging...')
file_list = os.listdir(opts.input)
count = 0
for doc in file_list:
    document = TextAnnotation(json_str=open(os.path.join(opts.input, doc)).read())
    token_list = document.tokens
    start = 0
    view_as_json = {}
    cons_list = []
    # drop any existing NER_CONLL view so it can be replaced with the model's predictions
    if 'NER_CONLL' in document.view_dictionary:
        del document.view_dictionary['NER_CONLL']
    for sent_end_offset in document.sentences['sentenceEndPositions']:
        words_ini = token_list[start:sent_end_offset]
        line = " ".join(words_ini)
        if line:
            # Lowercase sentence
            if parameters['lower']:
cg = CandGen(lang=args.lang, year=args.year, inlinks=inlinks, tsl=tsl,
             tsl_concept_pair=tsl_concept_pair, tsl_translit_dict=tsl_translit_dict,
             spellchecker=spellchecker, classifier=None, wiki_cg=wiki_cg)
cg.load_kb(args.kbdir)

# MongoDB
args.eid2wikisummary = MongoBackedDict(dbname="eid2wikisummary")
# args.mention2url_entity = MongoBackedDict(dbname=f"mention2url_entity_{args.lang}")
# args.mention2gmap_entity = MongoBackedDict(dbname=f"mention2gmap_entity_{args.lang}")

returned_json = cg.compute_hits_for_ta(docta=docta, outfile=None, args=args)
linking_results = returned_json["viewData"][0]["constituents"]
print(returned_json)

# example input file: /pool0/webserver/incoming/experiment_tmp/EDL2019/data/input/ak/AKA_NA_006644_20170516_H0025ZXL0
docta = TextAnnotation(json_str=open(
    '/pool0/webserver/incoming/experiment_tmp/EDL2019/data/input/ak/AKA_NA_006644_20170516_H0025ZXL0',
    encoding='utf8', errors='ignore').read())
call_this_for_demo(docta, 'ak', 1)
def readJsonTA(jsonfp):
    with open(jsonfp, 'r') as f:
        tajsonstr = f.read()
    ta = TextAnnotation(json_str=tajsonstr)
    return ta
def new_tajsonstr(self, tajsonstr):
    """tajsonstr is a JSON string of a TextAnnotation."""
    ta = TextAnnotation(json_str=tajsonstr)
    self.new_ta(ta)
    id2wiki[tmp[id_pos]] = tmp[link_pos].split('|')

output_f = open(out_file, 'w', encoding='utf8')
output_f.write('\t'.join(cols + ['wiki_link\n']))
for line in tqdm(gold_f):
    tid = None
    wikiname = None
    entry = line[:-1].split('\t')
    mention_id = entry[mention_id_pos]
    filename, se = mention_id.split(':')
    file = os.path.join(input_dir, filename)
    if not os.path.exists(file):
        print(file)
        continue
    docta = TextAnnotation(json_str=open(file, encoding='utf8', errors='ignore').read())
    mention_text = docta.text[int(se.split('-')[0]):int(se.split('-')[1]) + 1]
    # if not '_' in entry[mention_text_pos]:
    #     if not entry[mention_text_pos] == mention_text:
    #         print(entry[mention_text_pos], mention_text)
    entry[mention_text_pos] = mention_text
    assert entry[mention_text_pos] == mention_text
    if 'NIL' in entry[kb_pos]:
        wiki_link_final = "NAN"
        wikiname_pro = "NAN"
    else:
        wiki_links = set([item for kbid in entry[kb_pos].split('|') for item in id2wiki[kbid]])
        if wiki_links == {''}:
            continue
        for wiki_link in wiki_links: