def cql_search(request):
    from pynlpl.formats import fql, cql
    # parse the incoming parameters
    params = json.loads(request.body.decode('utf-8'))
    # rebuild the FoLiA document from the current data
    doc = folia.Document(id='doc')
    text = folia.Text(doc, id='doc.text')
    sentences = Sentence.objects.all()
    # populate the document with all sentences and their words
    for s in sentences:
        sen = text.append(folia.Sentence(doc, id=doc.id + '.s.' + str(s.id)))
        words = Word.objects.filter(Sentence_id=s.id)
        for w in words:
            sen.append(folia.Word(doc, id=doc.id + '.s.' + str(s.id) + '.w.' + str(w.id), text=w.value))
    doc.append(text)
    # convert the CQL query to FQL and run it against the document
    query = fql.Query(cql.cql2fql(params['title']))
    texts = query(doc)
    arr = []
    for t in texts:
        arr.append(t[0].parent.id.split('s.')[1])
    sens = Sentence.objects.filter(id__in=arr)
    # render the results
    return render(request, 'cabinet/cql_results.html', {'texts': texts, 'sens': sens})
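A minimal sketch of the CQL-to-FQL step in isolation, without the Django plumbing; the document contents and the CQL pattern here are illustrative assumptions, not taken from the view above:

# Sketch: build a tiny in-memory document and query it with CQL converted to FQL.
from pynlpl.formats import folia, fql, cql

doc = folia.Document(id='demo')
text = doc.append(folia.Text(doc, id='demo.text'))
sen = text.append(folia.Sentence(doc, id='demo.text.s.1'))
sen.append(folia.Word(doc, id='demo.text.s.1.w.1', text='hello'))
sen.append(folia.Word(doc, id='demo.text.s.1.w.2', text='world'))

query = fql.Query(cql.cql2fql('"hello"'))  # assumed example: match the literal token "hello"
for match in query(doc):
    print(match[0].parent.id)  # id of the sentence containing the matched word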
def makefoliadoc(outputfile):
    baseid = os.path.basename(outputfile).replace('.folia.xml', '').replace('.xml', '')
    foliadoc = folia.Document(id=baseid)
    foliadoc.append(folia.Text(foliadoc, id=baseid + '.text'))
    # declare every annotation type the Alpino conversion may produce
    if not foliadoc.declared(folia.AnnotationType.TOKEN, 'alpino-tokens'):
        foliadoc.declare(folia.AnnotationType.TOKEN, 'alpino-tokens')
    if not foliadoc.declared(folia.LemmaAnnotation, 'alpino-lemmas'):
        foliadoc.declare(folia.LemmaAnnotation, 'alpino-lemmas')
    if not foliadoc.declared(folia.SenseAnnotation, 'alpino-sense'):
        foliadoc.declare(folia.SenseAnnotation, 'alpino-sense')
    if not foliadoc.declared(folia.PosAnnotation, 'alpino-pos'):
        foliadoc.declare(folia.PosAnnotation, 'alpino-pos')
    if not foliadoc.declared(folia.AnnotationType.DEPENDENCY, 'alpino-dependency'):
        foliadoc.declare(folia.AnnotationType.DEPENDENCY, 'alpino-dependency')
    if not foliadoc.declared(folia.AnnotationType.SYNTAX, 'alpino-syntax'):
        foliadoc.declare(folia.AnnotationType.SYNTAX, 'alpino-syntax')
    if not foliadoc.declared(folia.AnnotationType.MORPHOLOGICAL, 'alpino-morphology'):
        foliadoc.declare(folia.AnnotationType.MORPHOLOGICAL, 'alpino-morphology')
    return foliadoc
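A hedged usage sketch of makefoliadoc; the filename and the added content are illustrative:

# Usage sketch: create an empty, fully declared document, add one token, save it.
# 'example.folia.xml' is an assumed filename, not from the original code.
from pynlpl.formats import folia

doc = makefoliadoc('example.folia.xml')   # document id becomes 'example'
body = doc.data[0]                        # the folia.Text appended by makefoliadoc
sen = body.append(folia.Sentence(doc, generate_id_in=body))
sen.append(folia.Word(doc, text='Hallo', generate_id_in=sen))
doc.save('example.folia.xml')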
def foliacat(id, outputfile, *files):
    totalmerges = 0
    outputdoc = folia.Document(id=id)
    text = outputdoc.append(folia.Text(outputdoc, id=id + ".text"))
    for i, filename in enumerate(files):
        merges = 0
        print("Processing " + filename, file=sys.stderr)
        inputdoc = folia.Document(file=filename)
        print("(merging document)", file=sys.stderr)
        # carry over any annotation declarations the output document lacks
        for annotationtype, annotationset in inputdoc.annotations:
            if not outputdoc.declared(annotationtype, annotationset):
                outputdoc.declare(annotationtype, annotationset)
        for d in inputdoc.data:
            merges += concat(text, d)
        print("(merged " + str(merges) + " elements, with all elements contained therein)", file=sys.stderr)
        totalmerges += merges
    print("(TOTAL: merged " + str(totalmerges) + " elements, with all elements contained therein)", file=sys.stderr)
    if outputfile and totalmerges > 0:  # was: merges > 0, which only checked the last input file
        outputdoc.save(outputfile)
    return outputdoc
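A usage sketch; the filenames and the id 'merged' are assumptions, and foliacat() depends on a concat() helper defined elsewhere in the same module:

# Usage sketch: merge two FoLiA documents into one and write the result.
merged = foliacat('merged', 'merged.folia.xml', 'first.folia.xml', 'second.folia.xml')
print(len(merged.data[0]))  # number of top-level elements now under the text body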
def convert_text_layer(nafparser, foliadoc):
    textbody = foliadoc.append(folia.Text(foliadoc, id=foliadoc.id + '.text'))
    naf_raw = nafparser.get_raw()
    textbody.append(folia.TextContent, naf_raw)
    prevsent_id = None
    prevpara_id = None
    paragraph = None
    prevword = None
    prev_naf_token = None
    for naf_token in nafparser.get_tokens():
        para_id = naf_token.get_para()
        sent_id = naf_token.get_sent()
        if para_id != prevpara_id:
            if prevpara_id is None:
                # first paragraph: declare the annotation type for completeness' sake
                foliadoc.declare(folia.Paragraph, 'undefined')
            paragraph = textbody.append(folia.Paragraph, id=foliadoc.id + '.para' + para_id)
        if sent_id != prevsent_id:
            if paragraph is not None:
                sentence = paragraph.append(folia.Sentence, id=foliadoc.id + '.sent' + sent_id)
            else:
                sentence = textbody.append(folia.Sentence, id=foliadoc.id + '.sent' + sent_id)
        token_id = naf_token.get_id()
        # if this token directly follows the previous one, suppress the space after the previous word
        if prev_naf_token is not None and int(prev_naf_token.get_offset()) + int(prev_naf_token.get_length()) == int(naf_token.get_offset()):
            prevword.space = False
        word = sentence.append(folia.Word, id=foliadoc.id + '.' + token_id)
        # validate the token offset against the raw text layer
        offset = int(naf_token.get_offset())
        try:
            offset_valid = naf_raw[offset:offset + int(naf_token.get_length())] == naf_token.get_text()
        except IndexError:
            offset_valid = False
        if not offset_valid:
            print("WARNING: NAF error: offset for token " + token_id + " does not align properly with raw layer! Discarding offset information for FoLiA conversion", file=sys.stderr)
            word.append(folia.TextContent, naf_token.get_text())
        else:
            word.append(folia.TextContent, naf_token.get_text(), offset=offset, ref=textbody)
        prevword = word
        prev_naf_token = naf_token
        prevpara_id = para_id
        prevsent_id = sent_id
    return textbody
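A usage sketch, assuming the nafparser argument follows the KafNafParserPy interface (get_raw(), get_tokens(), and the token accessors used above); the filenames and the document id are illustrative:

# Usage sketch: convert the text layer of a NAF file into a FoLiA document.
from KafNafParserPy import KafNafParser
from pynlpl.formats import folia

nafparser = KafNafParser('input.naf')   # assumed input path
foliadoc = folia.Document(id='converted')
convert_text_layer(nafparser, foliadoc)
foliadoc.save('converted.folia.xml')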
elif word:
    w = folia.Word(foliadoc, text=word, generate_id_in=s)
    if lemma:
        w.append(folia.LemmaAnnotation(foliadoc, cls=lemma))
    if pos:
        w.append(folia.PosAnnotation(foliadoc, cls=pos))
    s.append(w)
else:
    foliadoc = folia.Document(id=foliaid)
    foliadoc.declare(folia.AnnotationType.TOKEN, set='http://ilk.uvt.nl/folia/sets/ucto-nl.foliaset', annotator='Frog', annotatortype=folia.AnnotatorType.AUTO)
    foliadoc.declare(folia.AnnotationType.POS, set='http://ilk.uvt.nl/folia/sets/cgn-legacy.foliaset', annotator='Frog', annotatortype=folia.AnnotatorType.AUTO)
    foliadoc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog', annotatortype=folia.AnnotatorType.AUTO)
    foliadoc.language('nld')
    text = folia.Text(foliadoc, id=foliadoc.id + '.text.1')
    foliadoc.append(text)
    curid = None
    for (fragment, id) in zip(data, idmap):
        if mode == 's' or mode == 'n':
            if id:
                s = folia.Sentence(foliadoc, id=id)
            else:
                s = folia.Sentence(foliadoc, generate_id_in=text)
        elif mode == 'p':
            if id:
                p = folia.Paragraph(foliadoc, id=id)
            else:
                p = folia.Paragraph(foliadoc, generate_id_in=text)
try:
    os.mkdir(os.path.join(outdir, collection_id))
except OSError:
    pass  # directory already exists
files = list(glob.glob(os.path.join(compdir, "nl", "*.gz"))) + list(glob.glob(os.path.join(compdir, "vl", "*.gz")))
for path in files:
    text_id = os.path.basename(path).split(".")[0]
    print("\t" + text_id)
    full_id = collection_id + "_" + text_id
    au_id = None
    sentence = None
    doc = folia.Document(id=full_id)
    doc.metadatatype = folia.MetaDataType.IMDI
    doc.metadatafile = text_id + ".imdi"
    textbody = doc.append(folia.Text(doc, id=full_id + "." + text_id))
    doc.declare(folia.PosAnnotation, set="hdl:1839/00-SCHM-0000-0000-000B-9")
    doc.declare(folia.LemmaAnnotation, set="hdl:1839/00-SCHM-0000-0000-000E-3")
    fin = gzip.open(path, 'r')
    for line in fin:
        line = unicode(line, CGN_ENCODING)
        if line:
            if line[0:3] == '<au':
                # start of a new utterance: extract the id from the <au id="..."> tag
                end = line[8:].find('"')
                if end != -1:
                    end += 8
                au_id = line[8:end]
                sentence = textbody.append(folia.Sentence, id=full_id + ".s." + au_id)
            elif line[0:3] == '<mu':
                au_id = None