def test_create_triples(self):
    from amcat.models.token import TripleValues, TokenValues
    s = amcattest.create_test_analysis_sentence()
    tokens = [
        TokenValues(s.id, 0, word="a", lemma="l", pos="p", major="major", minor="minor", namedentity=None),
        TokenValues(s.id, 1, word="b", lemma="l", pos="p", major="major", minor="minor", namedentity=None),
    ]
    t = TripleValues(s.id, 0, 1, "su")
    result_tokens, result_triples, corefsets = store_analysis(s.analysed_article, tokens, [t])
    tr, = Triple.objects.filter(parent__sentence=s)
    self.assertEqual(tr.relation.label, t.relation)
    self.assertEqual(tr.child.word.word, "a")
    for tokenvalue, token in result_tokens.items():
        self.assertEqual(tokenvalue.position, token.position)
        self.assertEqual(tokenvalue.lemma, token.word.lemma.lemma)
def test_create_lemmata(self):
    from amcat.models.token import TokenValues
    lang = amcattest.get_test_language()
    l1 = Lemma.objects.create(lemma="a", pos="b")
    tokens = [TokenValues(None, None, None, lemma=l, pos="b", major=None, minor=None, namedentity=None)
              for l in "a" * 10]
    tokens += [TokenValues(None, None, None, lemma=l, pos="c", major=None, minor=None, namedentity=None)
               for l in "ab" * 5]
    with self.checkMaxQueries(3):  # 1 to cache, 2 to create with different poss
        lemmata = create_lemmata(tokens)
    # are existing lemmata 'recycled'?
    self.assertEqual(lemmata["a", "b"].id, l1.id)
    # did we get the correct lemmata?
    self.assertEqual(set(lemmata.keys()), set([("a", "b"), ("a", "c"), ("b", "c")]))
    for (lemmastr, pos), lemma in lemmata.items():
        self.assertEqual(lemma.lemma, lemmastr)
def test_long_strings(self):
    """Test whether overly long lemmata, words, and pos are truncated"""
    from amcat.models.token import TokenValues, TripleValues
    s = amcattest.create_test_analysis_sentence()
    longpos = TokenValues(s.id, 0, word="a", lemma="l", pos="pp", major="m", minor="m", namedentity=None)
    self.assertRaises(Exception, list, create_tokens([longpos]))
    nonepos = TokenValues(s.id, 0, word="a", lemma="l", pos="p", major="m", minor="m", namedentity=None)
    longvals = TokenValues(s.id, 1, word="a" * 9999, lemma="l" * 9999, pos="p",
                           major="m" * 9999, minor="m" * 9999, namedentity=None)
    triple = TripleValues(s.id, 0, 1, "x" * 9999)
    create_triples([nonepos, longvals], [triple])
    # django validation for length
    t, = Triple.objects.filter(parent__sentence=s)
    t.full_clean()
    t.relation.full_clean()
    for token in (t.parent, t.child):
        token.full_clean()
        token.word.full_clean()
        token.word.lemma.full_clean()
        token.pos.full_clean()
def test_create_words(self):
    from amcat.models.token import TokenValues
    lang = amcattest.get_test_language()
    tokens = []
    l1 = Lemma.objects.create(lemma="a", pos="b")
    w1 = Word.objects.create(lemma=l1, word="b")
    for lemma in "ab":
        for word in "bbcc":
            tokens.append(TokenValues(None, None, word=word, lemma=lemma, pos="b",
                                      major=None, minor=None, namedentity=None))
    with self.checkMaxQueries(8):  # 2 to cache lemmata+words, 1 to create lemmata, 5 to create words
        words = create_words(tokens)
    self.assertEqual(set(words.keys()),
                     set([("a", "b", "b"), ("a", "b", "c"), ("b", "b", "b"), ("b", "b", "c")]))
    for (lemmastr, pos, wordstr), word in words.items():
        self.assertEqual(word.word, wordstr)
        self.assertEqual(word.lemma.lemma, lemmastr)
    self.assertEqual(words["a", "b", "b"].id, w1.id)
    self.assertEqual(words["a", "b", "c"].lemma_id, l1.id)
def create_tokens(sid, words, tokens):
    for position, s in enumerate(tokens):
        lemma, pos = s.rsplit("/", 1)
        poscat = POSMAP[pos]
        yield TokenValues(sid, position, words[position], lemma, poscat, pos, None)
def create_tokenvalue(analysis_article=None, **kargs):
    if 'analysis_sentence' not in kargs:
        kargs['analysis_sentence'] = create_test_analysis_sentence(analysis_article).id
    defaults = dict(position=_get_next_id(), word='test_word', lemma='test_lemma',
                    pos='T', major='test_major', minor='test_minor', namedentity=None)
    for key, default in defaults.items():
        if key not in kargs:
            kargs[key] = default
    from amcat.models.token import TokenValues
    return TokenValues(**kargs)
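# A minimal usage sketch for the factory above. It assumes create_tokenvalue is exposed from
# the amcattest test-helper module alongside the other create_test_* helpers used in this
# section; any field not passed explicitly falls back to the defaults filled in above.
from amcat.tools import amcattest

tv = amcattest.create_tokenvalue(word="apple", lemma="apple", pos="N")
assert tv.word == "apple"       # explicit overrides are kept
assert tv.namedentity is None   # unspecified fields get the default values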
def get_tokenvalues(words, analysis_sentence):
    for i, info in enumerate(words):
        word = info['Text']
        pos = info['PartOfSpeech']
        poscat = POSMAP[pos]
        ner = info['NamedEntityTag']
        ner = NERMAP[ner] if ner != 'O' else None
        yield TokenValues(analysis_sentence.id, i, word, info['Lemma'], poscat, pos, None, ner)
def get_token(analysis_sentence_id, token):
    # TokenValues = namedtuple("TokenValues", ["analysis_sentence", "position", "word", "lemma",
    #                                          "pos", "major", "minor", "namedentity"])
    pos_major = token.find("POS").text
    pos = POSMAP[pos_major]
    ner = token.find("NER").text
    ner = NERMAP[ner] if ner != 'O' else None
    return TokenValues(analysis_sentence_id, int(token.get("id")) - 1,
                       token.find("word").text, token.find("lemma").text,
                       pos, pos_major, None, ner)
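# The helpers in this section all construct TokenValues / TripleValues from
# amcat.models.token. A sketch of the two namedtuples, based on the commented-out
# definition in get_token above and on the call sites in this section; the TripleValues
# field names are an assumption inferred from usage, not copied from the actual module:
from collections import namedtuple

TokenValues = namedtuple("TokenValues", ["analysis_sentence", "position", "word", "lemma",
                                         "pos", "major", "minor", "namedentity"])
TripleValues = namedtuple("TripleValues", ["analysis_sentence", "child", "parent", "relation"])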
def create_values(sid, words):
    tokens = []
    triples = []
    for word in words:
        tokens.append(TokenValues(sid, int(word["id"]), word["form"], word["lemma"],
                                  map_pos(word["pos"]), word["pos"], None, None))
        head = int(word["head"])
        if head:
            triples.append(TripleValues(sid, int(word['id']), head, word['deprel']))
    return tokens, triples
def run(self, _input=None):
    articles = self.options["articleset"].articles.only("uuid")
    print "["  # manually output json so we don't need to keep all in memory

    def sent_tuple(article, analysissentence):
        return (analysissentence.sentence.parnr, analysissentence.sentence.sentnr)

    for i, a in enumerate(articles):
        if i:
            print ","
        print >> sys.stderr, "{i} / {n}: {a.id} / {a.uuid}".format(n=len(articles), **locals())
        sentences = list(a.sentences.all())
        sentencevalues = [(s.parnr, s.sentnr, s.sentence) for s in sentences]
        tokens = list(Token.objects.filter(sentence__sentence__in=sentences)
                      .select_related("sentence__sentence", "word", "word__lemma", "pos"))
        sent_tuples = {t: sent_tuple(a, t.sentence) for t in tokens}
        tokenvalues = [TokenValues(sent_tuples[t], t.position, t.word.word, t.word.lemma.lemma,
                                   t.pos.pos, t.pos.major, t.pos.minor, None)
                       for t in tokens]
        triples = list(Triple.objects.filter(child__in=tokens)
                       .select_related("child", "parent", "relation"))
        triplevalues = [TripleValues(sent_tuples[t.child], t.child.position,
                                     t.parent.position, t.relation.label)
                        for t in triples]
        data = dict(article=a.uuid, sentences=sentencevalues,
                    tokens=tokenvalues, triples=triplevalues)
        json.dump(data, sys.stdout)
        sys.stdout.flush()
    print "]"
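# For reference, run() above streams a JSON array with one object per article; the shape is
# roughly as below (namedtuples serialize as JSON arrays; the values are illustrative only):
# [
#   {"article": "<uuid>",
#    "sentences": [[parnr, sentnr, "sentence text"], ...],
#    "tokens":    [[[parnr, sentnr], position, word, lemma, pos, major, minor, null], ...],
#    "triples":   [[[parnr, sentnr], child_position, parent_position, relation], ...]},
#   ...
# ]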
def test_create_tokens(self):
    from amcat.models.token import TokenValues
    s = amcattest.create_test_analysis_sentence()
    tokens = [TokenValues(s.id, 2, word="w", lemma="l", pos="p",
                          major="major", minor="minor", namedentity=None)]
    token, = dict(create_tokens(tokens)).values()
    self.assertEqual(token.word.lemma.lemma, "l")
def interpret_token(sid, lemma, word, begin, _end, dummypos, dummypos2, pos):
    # Split a POS string such as "N(soort,ev)" (illustrative) into major "N" and minor "soort,ev";
    # a tag without parentheses has no minor part.
    if "(" in pos:
        major, minor = pos.split("(", 1)
        minor = minor[:-1]  # strip the closing parenthesis
    else:
        major, minor = pos, None
    # For compound majors like "a_b", look up the last component in the POS map.
    m2 = major.split("_")[-1] if "_" in major else major
    cat = POSMAP.get(m2)
    if not cat:
        raise Exception("Unknown POS: %r (%s/%s/%s/%s)" % (m2, major, begin, word, pos))
    return TokenValues(sid, int(begin), word, lemma, cat, major, minor, None)
def test_process(self):
    from amcat.models.token import TokenValues

    class X(AnalysisScript):
        def __init__(self):
            super(X, self).__init__(analysis=None, tokens=True, triples=False)

        def get_tokens(self, analysis_sentence, memo=None):
            for i, x in enumerate(analysis_sentence.sentence.sentence.split()):
                yield TokenValues(analysis_sentence, i + 1, x, None, None, None, None)

    a = amcattest.create_test_analysis_sentence(
        sentence=amcattest.create_test_sentence(sentence="dit is een test"))
    tokens, triples = list(X().process_sentence(a))
    print(tokens)
    self.assertIsNone(triples)
    self.assertEqual(list(tokens)[0], TokenValues(a, 1, "dit", None, None, None, None))
def test_interpret_xml(self):
    # <!-- Mary met John. She likes him. -->
    analysis_sentences = range(10)
    tokens, triples, corefsets = interpret_xml(analysis_sentences, self._get_test_xml())
    self.assertEqual(set(tokens), {
        TokenValues(0, 0, 'Mary', 'Mary', 'N', "NNP", None, 'P'),
        TokenValues(0, 1, 'met', 'meet', 'V', "VBD", None, None),
        TokenValues(0, 2, 'John', 'John', 'N', "NNP", None, 'P'),
        TokenValues(1, 0, 'She', 'she', 'O', "PRP", None, None),
        TokenValues(1, 1, 'likes', 'like', 'V', "VBZ", None, None),
        TokenValues(1, 2, 'him', 'he', 'O', "PRP", None, None),
    })
    self.assertEqual(set(triples), {
        TripleValues(0, 0, 1, "nsubj"),
        TripleValues(0, 2, 1, "dobj"),
        TripleValues(1, 0, 1, "nsubj"),
        TripleValues(1, 2, 1, "dobj"),
    })
    self.assertEqual({frozenset(coref) for coref in corefsets},
                     {frozenset([(0, 0), (1, 0)]), frozenset([(0, 2), (1, 2)])})
def get_tokens(self, analysis_sentence, memo=None):
    for i, x in enumerate(analysis_sentence.sentence.sentence.split()):
        yield TokenValues(analysis_sentence, i + 1, x, None, None, None, None)
def get_tokens(self, analysis_sentence, memo=None):
    if memo is None:
        memo = self.preprocess_sentence(analysis_sentence)
    for line in memo:
        position, word, lemma, pos = [line[i] for i in (0, 1, 2, 4)]
        yield TokenValues(analysis_sentence, int(position) - 1, word, lemma, *read_pos(pos))
def clean_tokens(self):
    tokens = self.cleaned_data["tokens"]
    try:
        return [TokenValues(*fields) for fields in json.loads(tokens)]
    except ValueError as e:
        raise forms.ValidationError(e)
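# A sketch of the JSON payload clean_tokens accepts: a list of field lists, each of which is
# splatted positionally into TokenValues (field order per the namedtuple sketch earlier;
# the concrete values and the form class name below are hypothetical):
import json

example_tokens = json.dumps([
    [1, 0, "Mary", "Mary", "N", "NNP", None, "P"],
    [1, 1, "met", "meet", "V", "VBD", None, None],
])
# form = TokenUploadForm(data={"tokens": example_tokens})  # hypothetical form class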