Example #1
 def annotate(self):
     # The request looks like "?text=<text>&views=<views>"; the two lines below extract these parameters.
     text = request.args.get('text')
     views = request.args.get('views')
     if text is None or views is None:
         return "The parameters 'text' and/or 'views' are not specified. Here is a sample input: ?text=\"This is a " \
                "sample sentence. I'm happy.\"&views=POS,NER "
     views = views.split(",")
     if self.provided_view not in views:
         logging.info("desired view not provided by this server.")
         # After discussing with Daniel, this is the agreed way to handle views not provided by this server:
         # the appelles server will fall back to the next remote server.
         return "VIEW NOT PROVIDED"
     # create a text ann with the required views for the model
     required_views = ",".join(self.get_required_views())
     ta_json = self.get_text_annotation_for_model(
         text=text, required_views=required_views)
     docta = TextAnnotation(json_str=ta_json)
     # send it to your model for inference
     docta = self.model.inference_on_ta(docta=docta,
                                        new_view_name=self.provided_view)
     # serialize the returned text annotation back to JSON
     ta_json = json.dumps(docta.as_json)
     # print("returning", ta_json)
     return ta_json
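
For reference, a minimal client-side sketch of how this endpoint might be called with the two expected query parameters. The URL, port, and route are hypothetical and not part of the example above:

import requests

# hypothetical server URL and route; the view name must be one the server provides
resp = requests.get("http://localhost:5000/annotate",
                    params={"text": "This is a sample sentence. I'm happy.",
                            "views": "POS,NER"})
ta_json = resp.text  # JSON-serialized TextAnnotation on success, or an error message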
Example #2
 def get_text_annotation_for_model(self, text: str,
                                   required_views: List[str]):
     # TODO: ccg_nlpy's TextAnnotation does not handle newlines (e.g., paragraph breaks), so strip them here.
     text = text.replace("\n", "")
     required_views = ",".join(required_views)
     ta_json = self.pipeline.call_server(text=text, views=required_views)
     ta = TextAnnotation(json_str=ta_json)
     return ta
Example #3
def load_sentences(path, lower, zeros, ratio=1.0):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """

    file_list = os.listdir(path)
    sentences = []
    label_list = set()
    for doc in file_list:
        print("Reading " + os.path.join(path, doc))
        document = TextAnnotation(
            json_str=open(os.path.join(path, doc)).read())
        ner_labels = document.view_dictionary['NER_CONLL'].cons_list
        if ner_labels is None:
            ner_labels = []
        ner_dict = {}
        for ner_constituent in ner_labels:
            for span in range(ner_constituent['start'],
                              ner_constituent['end']):
                if span - ner_constituent['start'] == 0:
                    ner_dict[span] = "B-" + ner_constituent['label']
                else:
                    ner_dict[span] = "I-" + ner_constituent['label']
                if ner_dict[span] not in label_list:
                    label_list.add(ner_dict[span])
                    print(ner_dict[span])
        try:
            sentences_cons_list = document.view_dictionary[
                'SENTENCE'].cons_list
        except KeyError:
            # no SENTENCE view: rebuild sentence spans from the sentence end positions
            sentences_cons_list = []
            start = 0
            for end in document.sentence_end_position:
                sent = " ".join(document.tokens[start:end])
                sentences_cons_list.append({
                    'tokens': sent,
                    'start': start,
                    'end': end
                })
                start = end
        for sent_constituent in sentences_cons_list:
            sentence = []
            sent = re.split(r"\s+", sent_constituent['tokens'])
            start = sent_constituent['start']
            end = sent_constituent['end']
            for token, span in zip(sent, range(start, end)):
                if span in ner_dict:
                    sentence.append([token, ner_dict[span]])
                else:
                    sentence.append([token, "O"])
            if len(sentence) > 1:
                sentences.append(sentence)

    random.shuffle(sentences)
    train_sentences = sentences[:int(ratio * len(sentences))]
    dev_sentences = sentences[int(ratio * len(sentences)):]
    return train_sentences, dev_sentences
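
A minimal usage sketch for the loader above, assuming a directory of TextAnnotation JSON files that carry an NER_CONLL view. The path is hypothetical, and lower/zeros are passed only to satisfy the signature (this excerpt does not use them):

train_sentences, dev_sentences = load_sentences(
    path="data/ta_json",  # hypothetical directory of TextAnnotation JSON files
    lower=False,
    zeros=False,
    ratio=0.9)  # 90% of the shuffled sentences go to train, the rest to dev
print(len(train_sentences), len(dev_sentences))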
Example #4
 def get_text_annotation_for_model(self, text, required_views):
     # TODO: ccg_nlpy's TextAnnotation does not handle newlines (e.g., paragraph breaks), so strip them here.
     text = text.replace("\n", "")
     pretokenized_text = [text.split(" ")]
     required_views = ",".join(required_views)
     logging.info(f"required_views:{required_views}")
     ta_json = self.pipeline.call_server_pretokenized(
         pretokenized_text=pretokenized_text, views=required_views)
     ta = TextAnnotation(json_str=ta_json)
     return ta
Example #5
 def process_file(file):
     infile = os.path.join(args.indir, file)
     outfile = os.path.join(args.outdir, file + '.json')
     logging.info(f'Processing {infile} and output to {outfile}')
     try:
         docta = TextAnnotation(
             json_str=open(infile, encoding='utf8', errors='ignore').read())
     except Exception:
         # skip files that cannot be parsed as TextAnnotation JSON
         return
     docid = infile.split("/")[-1]
     logging.info("processing docid %s", docid)
     cg.compute_hits_for_ta(docta=docta, outfile=outfile, args=args)
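
A short usage sketch for the helper above, assuming args.indir and args.outdir are set and cg has been constructed as in Example #9:

for fname in os.listdir(args.indir):
    process_file(fname)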
Example #6
def compute_cand_stats(tafiles, args):
    mention2cand = dict()
    for i, tafile in tqdm(enumerate(tafiles)):
        docta = TextAnnotation(json_str=open(os.path.join(args.indir, tafile)).read())
        docid = tafile.split("/")[-1]
        tokens2offset = defaultdict(list)
        # map each token string to its list of character offsets, in document order
        for token, offset in zip(docta.get_tokens, docta.char_offsets):
            tokens2offset[token].append(offset)
        candgen_view = docta.get_view("CANDGEN")
        for cons in candgen_view.cons_list:
            offset = tokens2offset[cons["tokens"]].pop(0)
            mention_name = f"{docid[:-5]}:{offset[0]}-{offset[1]}"
            if mention_name in mention2cand:
                logging.info("found duplicate mentions")
            if 'labelScoreMap' in cons:
                labelScoreMap = cons['labelScoreMap']
                mention2cand[mention_name] = labelScoreMap
            else:
                mention2cand[mention_name] = ["NIL"]
    return mention2cand
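
A brief usage sketch, assuming args.indir contains TextAnnotation JSON files that already carry a CANDGEN view:

tafiles = os.listdir(args.indir)
mention2cand = compute_cand_stats(tafiles, args)
logging.info("collected candidates for %d mentions", len(mention2cand))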
Example #7
    def new_test_file(self, test_mens_file):
        self.test_mens_file = test_mens_file

        with open(test_mens_file, 'r') as f:
            tajsonstr = f.read()
        ta = TextAnnotation(json_str=tajsonstr)
        self.textanno = ta

        (sentences_tokenized, modified_ner_cons_list) = self.processTestDoc(ta)

        self.mention_lines = self.convertSent2NerToMentionLines(
            sentences_tokenized, modified_ner_cons_list)

        self.mentions = []
        for line in self.mention_lines:
            m = Mention(line)
            self.mentions.append(m)

        self.men_idx = 0
        self.num_mens = len(self.mentions)
        self.epochs = 0
Example #8
# Load the model
_, f_eval = model.build(training=False, **parameters)
model.reload()

# f_output = codecs.open(opts.output, 'w', 'utf-8')
start = time.time()

print('Tagging...')

file_list = os.listdir(opts.input)

count = 0

for doc in file_list:
    document = TextAnnotation(
        json_str=open(os.path.join(opts.input, doc)).read())
    token_list = document.tokens
    start = 0

    view_as_json = {}
    cons_list = []

    if 'NER_CONLL' in document.view_dictionary:
        del document.view_dictionary['NER_CONLL']

    for sent_end_offset in document.sentences['sentenceEndPositions']:
        words_ini = token_list[start:sent_end_offset]
        line = " ".join(words_ini)
        if line:
            # Lowercase sentence
            if parameters['lower']:
Example #9
    cg = CandGen(lang=args.lang,
                 year=args.year,
                 inlinks=inlinks,
                 tsl=tsl,
                 tsl_concept_pair=tsl_concept_pair,
                 tsl_translit_dict=tsl_translit_dict,
                 spellchecker=spellchecker,
                 classifier=None,
                 wiki_cg=wiki_cg)
    cg.load_kb(args.kbdir)

    # Mongodb
    args.eid2wikisummary = MongoBackedDict(dbname="eid2wikisummary")
    # args.mention2url_entity = MongoBackedDict(dbname=f"mention2url_entity_{args.lang}")
    # args.mention2gmap_entity = MongoBackedDict(dbname=f"mention2gmap_entity_{args.lang}")

    returned_json = cg.compute_hits_for_ta(docta=docta,
                                           outfile=None,
                                           args=args)
    linking_results = returned_json["viewData"][0]["constituents"]
    print(returned_json)


# example input file: /pool0/webserver/incoming/experiment_tmp/EDL2019/data/input/ak/AKA_NA_006644_20170516_H0025ZXL0
docta = TextAnnotation(json_str=open(
    '/pool0/webserver/incoming/experiment_tmp/EDL2019/data/input/ak/AKA_NA_006644_20170516_H0025ZXL0',
    encoding='utf8',
    errors='ignore').read())
call_this_for_demo(docta, 'ak', 1)
Example #10
def readJsonTA(jsonfp):
    with open(jsonfp, 'r') as f:
        tajsonstr = f.read()
    ta = TextAnnotation(json_str=tajsonstr)
    return ta
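
A hedged usage sketch for the helper above; the file path is hypothetical and assumes a serialized TextAnnotation on disk:

ta = readJsonTA("doc.json")  # hypothetical path to a TextAnnotation JSON file
print(ta.get_tokens)  # token list, accessed the same way as in Example #6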
Example #11
 def new_tajsonstr(self, tajsonstr):
     """ tajsonstr is a json str of a TA """
     ta = TextAnnotation(json_str=tajsonstr)
     self.new_ta(ta)
Example #12
  id2wiki[tmp[id_pos]] = tmp[link_pos].split('|')

output_f = open(out_file, 'w', encoding='utf8')
output_f.write('\t'.join(cols + ['wiki_link\n']))

for line in tqdm(gold_f):
  tid = None
  wikiname = None
  entry = line[:-1].split('\t')
  mention_id = entry[mention_id_pos]
  filename, se = mention_id.split(':')
  file = os.path.join(input_dir, filename)
  if not os.path.exists(file):
    print(file)
    continue
  docta = TextAnnotation(json_str=open(file, encoding='utf8', errors='ignore').read())
  mention_text = docta.text[int(se.split('-')[0]):int(se.split('-')[1])+1]
  # if not '_' in entry[mention_text_pos]:
  #   if not entry[mention_text_pos] == mention_text:
  #     print(entry[mention_text_pos], mention_text)
  entry[mention_text_pos] = mention_text
  assert entry[mention_text_pos] == mention_text

  if 'NIL' in entry[kb_pos]:
    wiki_link_final = "NAN"
    wikiname_pro = "NAN"
  else:
    wiki_links = set([item for kbid in entry[kb_pos].split('|') for item in id2wiki[kbid]])
    if wiki_links == {''}:
      continue
    for wiki_link in wiki_links: