def foliacat(id, outputfile, *files):
    totalmerges = 0
    outputdoc = folia.Document(id=id)
    text = outputdoc.append(folia.Text(outputdoc, id=id + ".text"))
    for i, filename in enumerate(files):
        merges = 0
        print("Processing " + filename, file=sys.stderr)
        inputdoc = folia.Document(file=filename)
        print("(merging document)", file=sys.stderr)
        for annotationtype, set in inputdoc.annotations:
            if not outputdoc.declared(annotationtype, set):
                outputdoc.declare(annotationtype, set)
        for d in inputdoc.data:
            merges += concat(text, d)
        print("(merged " + str(merges) + " elements, with all elements contained therein)", file=sys.stderr)
        totalmerges += merges
    print("(TOTAL: merged " + str(totalmerges) + " elements, with all elements contained therein)", file=sys.stderr)
    if outputfile and totalmerges > 0:  # was 'merges', which only reflected the last input file
        outputdoc.save(outputfile)
    return outputdoc
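# Usage sketch for foliacat (the filenames and demo function are hypothetical;
# assumes each input is a valid FoLiA document on disk):
def _demo_foliacat():
    merged = foliacat("mycorpus", "mycorpus.folia.xml", "part1.folia.xml", "part2.folia.xml")
    print(merged.id)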
def compare(first, second):
    os.chdir(first)
    first_annotations = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        try:
            first_annotations.append(doc.metadata.data['Centrality'])
        except KeyError:
            first_annotations.append("Annotation Empty")
    os.chdir(second)  # note: 'second' should be an absolute path, since we already chdir'd into 'first'
    second_annotations = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        try:
            second_annotations.append(doc.metadata.data['Centrality'])
        except KeyError:
            second_annotations.append("Annotation Empty")
    first_annotator_final = []
    second_annotator_final = []
    first_not_labeled = []
    second_not_labeled = []
    for idx, elem in enumerate(first_annotations):
        if elem in ("Urban", "Rural") and second_annotations[idx] in ("Urban", "Rural"):
            first_annotator_final.append(elem)
            second_annotator_final.append(second_annotations[idx])
        else:
            first_not_labeled.append(elem)
            second_not_labeled.append(second_annotations[idx])
    return [[first_annotator_final, second_annotator_final], [first_not_labeled, second_not_labeled]]
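# Usage sketch: feed the paired labels returned by compare() into an
# inter-annotator agreement score. Assumes scikit-learn is installed and that
# the directory paths are absolute (see the os.chdir note above); the paths
# and demo function are hypothetical.
def _demo_agreement():
    from sklearn.metrics import cohen_kappa_score
    (first_final, second_final), _ = compare("/data/annotator1", "/data/annotator2")
    print("Cohen's kappa:", cohen_kappa_score(first_final, second_final))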
def main():
    global repetitions, target
    files = []
    try:
        begin = 1
        if os.path.exists(sys.argv[1]):
            begin = 1
            selectedtests = "all"
            repetitions = 1
        else:
            selectedtests = sys.argv[1].split(',')
            if os.path.exists(sys.argv[2]):
                repetitions = 1
                begin = 2
            else:
                repetitions = int(sys.argv[2])
                begin = 3
        filesordirs = sys.argv[begin:]
    except (IndexError, ValueError):
        print("Syntax: folia_benchmark [testfunctions [repetitions]] files-or-directories+", file=sys.stderr)
        print("        testfunctions is a comma separated list of function names, or the special keyword 'all'", file=sys.stderr)
        print("        directories are recursively searched for files with the extension folia.xml; .gz and .bz2 are supported too.", file=sys.stderr)
        sys.exit(2)
    for fd in filesordirs:
        if not os.path.exists(fd):
            raise Exception("No such file or directory: " + fd)
        if os.path.isfile(fd):
            files.append(fd)
        elif os.path.isdir(fd):
            dirs = [fd]
            while dirs:
                dir = dirs.pop(0)
                for filename in glob.glob(dir + "/*"):
                    if os.path.isdir(filename):
                        dirs.append(filename)
                    elif filename.endswith(('.folia.xml', '.folia.xml.gz', '.folia.xml.bz2')):
                        files.append(filename)
    for f in ('loadfile', 'loadfileleakbypass', 'readerwords'):
        if f in selectedtests or 'all' in selectedtests:
            for filename in files:
                globals()[f](filename=filename)
    for f in ('xml', 'text', 'json', 'countwords', 'selectwords', 'nextwords', 'ancestors', 'selectwordsfql', 'selectwordsfqlforp', 'selectwordsfqlxml', 'selectwordsfqlwhere', 'editwordsfql', 'addelement'):
        if f in selectedtests or 'all' in selectedtests:
            for filename in files:
                doc = folia.Document(file=filename)
                globals()[f](doc=doc)
    for f in ('memtest',):
        if f in selectedtests or 'all' in selectedtests:
            for filename in files:
                doc = folia.Document(file=filename)
                print("memtest -- Memory test on document " + filename + " -- memory consumption estimated at " + str(round(asizeof.asizeof(doc) / 1024 / 1024, 2)) + " MB" + " (filesize " + str(round(os.path.getsize(filename) / 1024 / 1024, 2)) + " MB)")
def process(data):
    i, filename = data
    category = os.path.basename(os.path.dirname(filename))
    progress = round((i + 1) / float(len(index)) * 100, 1)
    print("#" + str(i + 1) + " " + filename + ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + str(progress) + '%', file=sys.stderr)
    try:
        doc = folia.Document(file=filename)
    except Exception as e:
        print("ERROR loading " + filename + ": " + str(e), file=sys.stderr)
        return False
    filename = filename.replace(sonardir, '')
    if filename[0] == '/':
        filename = filename[1:]
    if filename[-4:] == '.pos':
        filename = filename[:-4]
    if filename[-4:] == '.tok':
        filename = filename[:-4]
    if filename[-4:] == '.ilk':
        filename = filename[:-4]
    # Load document prior to tokenisation
    try:
        pretokdoc = folia.Document(file=sonardir + '/' + filename)
    except Exception:
        print("WARNING unable to load pretokdoc " + filename, file=sys.stderr)
        pretokdoc = None
    if pretokdoc:
        for p2 in pretokdoc.paragraphs():
            try:
                p = doc[p2.id]
            except KeyError:
                print("ERROR: Paragraph " + p2.id + " not found. Tokenised and pre-tokenised versions out of sync?", file=sys.stderr)
                continue
            if p2.text:
                p.text = p2.text
    try:
        os.mkdir(foliadir + os.path.dirname(filename))
    except OSError:
        pass  # directory may already exist
    try:
        doc.save(foliadir + filename)
    except Exception:
        print("ERROR saving " + foliadir + filename, file=sys.stderr)
    try:
        f = codecs.open(foliadir + filename.replace('.xml', '.tok.txt'), 'w', 'utf-8')
        f.write(str(doc))  # was unicode(doc), Python 2 only
        f.close()
    except Exception:
        print("ERROR saving " + foliadir + filename.replace('.xml', '.tok.txt'), file=sys.stderr)
    sys.stdout.flush()
    sys.stderr.flush()
    return True
def process(filename, outputfile=None):
    print("Processing " + filename, file=sys.stderr)
    count = Counter()
    try:
        doc = folia.Document(file=filename)
        count['documents'] += 1
        for e in doc.select(folia.AbstractElement):
            if e.XMLTAG and (not settings.types or e.XMLTAG in settings.types):
                count[e.XMLTAG] += 1
        for constraintag, constrainf in settings.constraints:
            if not constrainf(count[constraintag]):
                print("Skipping due to unmet constraints (" + constraintag + "): " + filename, file=sys.stderr)
                return Counter({'skipped_documents': 1})
        print("Counted " + filename, file=sys.stderr)
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " + filename + ":", e, file=sys.stderr)
        else:
            raise
    return count
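# Usage sketch: the per-file Counters returned by process() can simply be
# summed for corpus-wide totals (hypothetical filenames and demo function):
def _demo_count():
    total = Counter()
    for fn in ("doc1.folia.xml", "doc2.folia.xml"):
        total += process(fn)
    print(total.most_common(10))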
def foliamerge(outputfile, *files, **kwargs):
    asalternative = 'asalternative' in kwargs and kwargs['asalternative']
    outputdoc = None
    merges = 0
    for i, filename in enumerate(files):
        print("Processing " + filename, file=sys.stderr)
        inputdoc = folia.Document(file=filename)
        if i == 0:
            print("(pivot document)", file=sys.stderr)
            outputdoc = inputdoc
        else:
            print("(merging document)", file=sys.stderr)
            for annotationtype, set in inputdoc.annotations:
                if not outputdoc.declared(annotationtype, set):
                    outputdoc.declare(annotationtype, set)
            for e in inputdoc:
                merges += mergechildren(e, outputdoc, asalternative)
    if outputfile and merges > 0:
        outputdoc.save(outputfile)
    return outputdoc
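# Usage sketch for foliamerge: the first file acts as the pivot into which the
# later files are merged (hypothetical filenames and demo function):
def _demo_foliamerge():
    merged = foliamerge("merged.folia.xml", "pivot.folia.xml", "extra.folia.xml", asalternative=True)
    print(merged.id)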
def compare(path, doc):
    convert(path, doc)
    ann = Annotations(path + doc)
    fdoc = folia.Document(file=path + doc + ".xml")
    # test entities
    for ent in ann.get_textbounds():
        try:
            found = fdoc[ent.id]
            text = [str(a) for a in found.wrefs()]
            if ent.tail.strip() != " ".join(text):
                print("error: not found entity")
                print(ent)
                return False
        except KeyError:
            print("error: not found entity")
            print(ent)
            return False
    # test relations
    for rel in ann.get_relations():
        try:
            found = fdoc[rel.id]
            arefs = list(found.select(folia.AlignReference))  # materialise: select() yields a generator, which we iterate twice
            if not (any(a.id == rel.arg1 for a in arefs) and any(a.id == rel.arg2 for a in arefs)):
                print("error: not found relation")
                print(rel)
                return False
        except KeyError:
            print("error: not found relation")
            print(rel)
            return False
    # test events
    for event in ann.get_events():
        try:
            found = fdoc[event.id]
            arefs = list(found.select(folia.AlignReference))
            for role, rid in event.args:
                if not any(a.id == rid for a in arefs):
                    print("error: not found event")
                    print(event)
                    return False
        except KeyError:
            print("error: not found event")
            print(event)
            return False
    # test attributes
    for attr in ann.get_attributes():
        try:
            found = fdoc[attr.target]
            if not any(fattr.cls == str(attr.value) and fattr.subset == attr.type for fattr in found.select(folia.Feature)):
                print("error: not found attr")
                print(attr)
                print()
                return False
        except KeyError:
            print("error: not found attr")
            print(attr)
            return False
    print("file " + path + doc + " is OK")
    return True
def makefoliadoc(outputfile):
    baseid = os.path.basename(outputfile).replace('.folia.xml', '').replace('.xml', '')
    foliadoc = folia.Document(id=baseid)
    foliadoc.append(folia.Text(foliadoc, id=baseid + '.text'))
    if not foliadoc.declared(folia.AnnotationType.TOKEN, 'alpino-tokens'):
        foliadoc.declare(folia.AnnotationType.TOKEN, 'alpino-tokens')
    if not foliadoc.declared(folia.LemmaAnnotation, 'alpino-lemmas'):
        foliadoc.declare(folia.LemmaAnnotation, 'alpino-lemmas')
    if not foliadoc.declared(folia.SenseAnnotation, 'alpino-sense'):
        foliadoc.declare(folia.SenseAnnotation, 'alpino-sense')
    if not foliadoc.declared(folia.PosAnnotation, 'alpino-pos'):
        foliadoc.declare(folia.PosAnnotation, 'alpino-pos')
    if not foliadoc.declared(folia.AnnotationType.DEPENDENCY, 'alpino-dependency'):
        foliadoc.declare(folia.AnnotationType.DEPENDENCY, 'alpino-dependency')
    if not foliadoc.declared(folia.AnnotationType.SYNTAX, 'alpino-syntax'):
        foliadoc.declare(folia.AnnotationType.SYNTAX, 'alpino-syntax')
    if not foliadoc.declared(folia.AnnotationType.MORPHOLOGICAL, 'alpino-morphology'):
        foliadoc.declare(folia.AnnotationType.MORPHOLOGICAL, 'alpino-morphology')
    return foliadoc
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.GetoptError as err:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)
    for o, a in opts:
        if o == '-h' or o == '--help':
            usage()
            sys.exit(0)
        else:
            raise Exception("No such option: " + o)
    if len(args) < 2:
        usage()
        sys.exit(2)
    else:
        alpinofiles = args[:-1]  # all but the last argument are Alpino input files
        foliafile = args[-1]
    if os.path.exists(foliafile):
        doc = folia.Document(file=foliafile)
    else:
        doc = makefoliadoc(foliafile)
    for alpinofile in alpinofiles:
        doc = alpino2folia(alpinofile, doc)
    doc.save(foliafile)
def convert(f_i, f_o=None):
    """
    f_i/f_o: input/output file name/path without extension (str)
    ...
    """
    doc_i = Eaf(''.join([f_i, '.eaf']))
    if not f_o:
        f_o = f_i
    # https://pynlpl.readthedocs.io/en/latest/folia.html#editing-folia
    # https://pynlpl.readthedocs.io/en/latest/folia.html#adding-structure
    # https://pynlpl.readthedocs.io/en/latest/folia.html#structure-annotation-types
    print(os.path.basename(f_o))
    doc_o = folia.Document(id=os.path.basename(f_o))
    # https://github.com/proycon/folia/blob/master/foliatools/conllu2folia.py
    doc_o.declare(folia.LemmaAnnotation, set=SET_LEMMA_MYSTEM, annotator="Mystem")
    doc_o.declare(folia.PosAnnotation, set=SET_POS_MYSTEM, annotator="Mystem")
    doc_o.declare(folia.PosAnnotation, set=SET_POS, annotator="BiRCh group")
    doc_o.declare(folia.SyntacticUnit, set=SET_SU, annotator="BiRCh group")
    speech = doc_o.append(folia.Speech)
    for aa in create_conversation(get_aas(doc_i)):
        utterance = speech.append(folia.Utterance, id=aa[0], speaker=aa[1], begintime=aa[2], endtime=aa[3])
        # https://docs.python.org/3/library/string.html#formatspec
        #utterance.append(folia.Word, '{:10}:'.format(aa[1]))
        utterance.append(folia.Word, '{}:'.format(aa[1].upper()))
        for w in get_tokens(aa[4]):
            # handle visibility of tokens in the form of tags
            if len(w) > 1 and w[0] == '<' and w[1] != '$':
                #print(w)
                w = '<$' + w[1:]
            token = utterance.append(folia.Word, w)
            if is_token_mystem(w):
                analysis_mystem = m.analyze(w)[0]['analysis']
                if analysis_mystem:
                    # mystem's lexeme -> lemma annotation (???)
                    if 'lex' in analysis_mystem[0]:
                        token.append(folia.LemmaAnnotation, cls=analysis_mystem[0]['lex'], set=SET_LEMMA_MYSTEM)
                    if 'gr' in analysis_mystem[0]:
                        pos_plus = analysis_mystem[0]['gr'].strip()
                        pos, features = analyze_mystem_gr(pos_plus)
                        an_pos = token.append(folia.PosAnnotation, head=pos, cls=pos_plus, set=SET_POS_MYSTEM)
                        # https://pynlpl.readthedocs.io/en/latest/folia.html#features
                        an_pos.append(folia.Feature, subset='all', cls=features)
    doc_o.save(''.join([f_o, '.folia.xml']))
def process_file(csv_writer, filename):
    """
    Reads a single FoLiA .xml-file, loops over its Sentences,
    and writes the annotations (Correction/SemanticRole) to the csv file.
    """
    doc = folia.Document(file=filename)
    for sentence in doc.sentences():
        sentence_nr = sentence.id.split('.')[-1]
        # Add Corrections on Sentence and Word level
        csv_writer.writerows(get_corrections(sentence, doc.id, sentence_nr))
        for word in sentence.words():
            csv_writer.writerows(get_corrections(word, doc.id, sentence_nr))
        # Add SemanticRoles
        for semrole in sentence.select(folia.SemanticRole):
            problem = get_feature(semrole, 'problem')
            pos = get_feature(semrole, 'pos')
            try:
                s_original = sentence.text(correctionhandling=folia.CorrectionHandling.ORIGINAL)
                s_corrected = sentence.text(correctionhandling=folia.CorrectionHandling.CURRENT)
            except folia.NoSuchText:
                s_original = ''
                s_corrected = ''
            csv_writer.writerow([
                doc.id, sentence_nr, get_text_from_semrole(semrole), '',
                semrole.cls, problem, pos, s_original, s_corrected
            ])
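# A minimal driver sketch for process_file (hypothetical paths and demo
# function; assumes the get_corrections and get_feature helpers used above
# are in scope):
def _demo_process_file():
    import csv
    with open('annotations.csv', 'w', newline='', encoding='utf-8') as f:
        process_file(csv.writer(f, delimiter=';'), 'example.folia.xml')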
def process(filename, queries):
    try:
        print("Processing " + filename, file=sys.stderr)
        doc = folia.Document(file=filename)
        dosave = False
        for query in queries:
            if query.format == "python":
                query.format = "xml"
            output = query(doc)
            print(output)
            if query.action and query.action.action in ('EDIT', 'DELETE', 'SUBSTITUTE', 'PREPEND', 'APPEND'):
                dosave = True
        # save document if changes are made
        if dosave:
            print("Saving " + filename, file=sys.stderr)
            doc.save()
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " + filename + ":", e, file=sys.stderr)
        else:
            raise
def folia_docnameetypewords2file(inpath, outpath):
    outfile = open(outpath, 'w')
    ids = []
    sentences_num = 0
    if os.path.isdir(inpath):
        for filename in os.listdir(inpath):
            doc = folia.Document(file=inpath + '/' + filename)
            if filename == "https__timesofindia.indiatimes.com_city_hyderabad_1st-anniversary-of-anti-power-hike-rally_articleshow_727307023.folia.xml":
                print("Here")
            docnamewritten = False
            for h, sentence in enumerate(doc.sentences()):
                for layer in sentence.select(folia.EntitiesLayer):
                    for i, entity in enumerate(layer.select(folia.Entity)):
                        if entity.cls == 'etype':
                            if not docnamewritten:
                                outfile.write('\n' + filename + '\n')
                                docnamewritten = True
                            sentence_tokenized = sentence.select(folia.Word)
                            words_folia = list(sentence_tokenized)
                            word_classes = [w.cls for w in words_folia]
                            if 'URL' in word_classes:
                                continue
                            sentences_num += 1
                            for word in entity.wrefs():
                                word_text = word.text()
                                outfile.write(word_text + '\n')
    else:
        print("TODO: Handling of a single Folia file instead of a folder of Folia files.")
    outfile.close()
def folia_sentences2file(inpath, outpath):
    outfile = open(outpath, 'w')
    ids = []
    sentences_num = 0
    if os.path.isdir(inpath):
        for filename in os.listdir(inpath):
            doc = folia.Document(file=inpath + '/' + filename)
            for h, sentence in enumerate(doc.sentences()):
                sentence_tokenized = sentence.select(folia.Word)
                words_folia = list(sentence_tokenized)
                word_classes = [w.cls for w in words_folia]
                if 'URL' in word_classes:
                    continue
                sentences_num += 1
                for i, word in enumerate(words_folia):
                    w_id = word.id
                    w_text = word.text()
                    if w_id in ids:
                        continue
                    if w_text == '<P>':
                        continue
                    ids.append(w_id)
                    # word.next() returns None when the <entities> tag is hit, which is where the newline belongs.
                    # The word.next() check is necessary for sentences with tagged entities: the len(words_folia)
                    # check alone does not suffice there, since it counts wrefs inside the entities as words too.
                    if (not word.next()) or i + 1 == len(words_folia):
                        outfile.write(w_text.lower() + '\n')
                    else:
                        outfile.write(w_text.lower() + ' ')
    else:
        print("TODO: Handling of a single Folia file instead of a folder of Folia files.")
    outfile.close()
def process(target):
    print("Processing " + target)
    if os.path.isdir(target):
        print("Descending into directory " + target)
        for f in glob.glob(target + '/*'):
            process(f)
    elif os.path.isfile(target) and target[-4:] == '.xml':
        print("Loading " + target)
        try:
            doc = folia.Document(file=target)
        except lxml.etree.XMLSyntaxError:
            print("UNABLE TO LOAD " + target + " (XML SYNTAX ERROR!)", file=sys.stderr)
            return None
        changed = False
        for word in doc.words():
            try:
                pos = word.annotation(folia.PosAnnotation)
            except folia.NoSuchAnnotation:
                continue
            try:
                word.replace(cgn.parse_cgn_postag(pos.cls))
                changed = True
            except cgn.InvalidTagException:
                print("WARNING: INVALID TAG " + pos.cls, file=sys.stderr)
                continue
        if changed:
            print("Saving...")
            doc.save()
def __get_folia_doc__(self, tokens):
    doc = folia.Document(id='nltk-sentence')
    folia_sent = doc.add(folia.Text)
    for tok, pos in tokens:
        word = folia_sent.add(folia.Word, tok)
        word.add(folia.PosAnnotation(None, set='custom', cls=pos))
    return doc
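# Usage sketch: __get_folia_doc__ expects (token, POS-tag) pairs such as NLTK
# tagger output ('tagger' is a hypothetical instance of the enclosing class):
def _demo_get_folia_doc(tagger):
    doc = tagger.__get_folia_doc__([('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')])
    print(doc.xmlstring())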
def upload(self, *namespaceargs):
    namespace = validatenamespace('/'.join(namespaceargs))
    log("In upload, namespace=" + namespace)
    response = {'version': VERSION}
    cl = cherrypy.request.headers['Content-Length']
    data = cherrypy.request.body.read(int(cl))
    cherrypy.response.headers['Content-Type'] = 'application/json'
    #data = cherrypy.request.params['data']
    try:
        log("Loading document from upload")
        doc = folia.Document(string=data, setdefinitions=self.docstore.setdefinitions, loadsetdefinitions=True)
        if not self.allowtextredundancy:
            for e in doc.data:
                cleantextredundancy(e)
            doc.changed = True
        response['docid'] = doc.id
        self.docstore[(namespace, doc.id)] = doc
    except Exception as e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        formatted_lines = traceback.format_exc().splitlines()
        traceback.print_tb(exc_traceback, limit=50, file=sys.stderr)
        response['error'] = "Uploaded file is no valid FoLiA Document: " + str(e) + " -- " + "\n".join(formatted_lines)
        log(response['error'])
        if logfile:
            traceback.print_tb(exc_traceback, limit=50, file=logfile)
        return json.dumps(response).encode('utf-8')
    filename = self.docstore.getfilename((namespace, doc.id))
    i = 1
    while os.path.exists(filename):
        filename = self.docstore.getfilename((namespace, doc.id + "." + str(i)))
        i += 1
    self.docstore.save((namespace, doc.id), "Initial upload")
    return json.dumps(response).encode('utf-8')
def process(filename):
    print("Processing " + filename, file=sys.stderr)
    doc = folia.Document(file=filename)
    freqlist = FrequencyList()
    if settings.n == 1:
        for word in doc.words():
            text = word.toktext()
            if not settings.casesensitive:
                text = text.lower()
            freqlist.count(text)
    elif settings.sentencemarkers:
        for sentence in doc.sentences():
            for ngram in Windower(sentence.words(), settings.n):
                text = ' '.join(x.toktext() for x in ngram)
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
    else:
        for ngram in Windower(doc.words(), settings.n, None, None):
            text = ' '.join(x.toktext() for x in ngram)
            if not settings.casesensitive:
                text = text.lower()
            freqlist.count(text)
    if settings.autooutput:
        if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
            outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
        else:
            outfilename = filename + '.freqlist'
        freqlist.save(outfilename, True)
    return freqlist
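# Illustration of the n-grams Windower yields for the counting above (a
# sketch against pynlpl.textprocessors.Windower using plain strings; by
# default Windower pads with <begin>/<end> markers, which the final branch
# above disables by passing None, None):
def _demo_windower():
    from pynlpl.textprocessors import Windower
    print(list(Windower(['the', 'cat', 'sat'], 2, None, None)))
    # expected: [('the', 'cat'), ('cat', 'sat')]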
def process_file(filename):
    """
    Reads a single FoLiA .xml-file, loops over its Sentences,
    and writes the text to the csv file.
    """
    bname, _ = os.path.splitext(os.path.basename(filename))
    dname = os.path.dirname(filename)
    orig_name = os.path.join(dname, 'orig_{}.txt'.format(bname))
    corr_name = os.path.join(dname, 'corr_{}.txt'.format(bname))
    with codecs.open(orig_name, 'wb', 'utf-8') as orig_file:
        with codecs.open(corr_name, 'wb', 'utf-8') as corr_file:
            doc = folia.Document(file=filename)
            for sentence in doc.sentences():
                try:
                    s_original = sentence.text(correctionhandling=folia.CorrectionHandling.ORIGINAL)
                    s_corrected = sentence.text(correctionhandling=folia.CorrectionHandling.CURRENT)
                except folia.NoSuchText:
                    s_original = ''
                    s_corrected = ''
                orig_file.write(replace_specials(s_original))
                orig_file.write('\n')
                corr_file.write(replace_specials(s_corrected))
                corr_file.write('\n')
def cql_search(request):
    from pynlpl.formats import fql, cql
    # parse the incoming parameters
    params = json.loads(request.body.decode('utf-8'))
    # rebuild the FoLiA document from the current data
    doc = folia.Document(id='doc')
    text = folia.Text(doc, id='doc.text')
    sentences = Sentence.objects.all()
    # search for words in the document
    for s in sentences:
        sen = text.append(folia.Sentence(doc, id=doc.id + '.s.' + str(s.id)))
        words = Word.objects.filter(Sentence_id=s.id)
        for w in words:
            sen.append(folia.Word(doc, id=doc.id + '.s.' + str(s.id) + '.w.' + str(w.id), text=w.value))
    doc.append(text)
    query = fql.Query(cql.cql2fql(params['title']))
    texts = query(doc)
    arr = []
    for t in texts:
        arr.append(t[0].parent.id.split('s.')[1])
    sens = Sentence.objects.filter(id__in=arr)
    # render the results
    return render(request, 'cabinet/cql_results.html', {'texts': texts, 'sens': sens})
def load(self, key, forcereload=False):
    if key[0] == "testflat":
        key = ("testflat", "testflat")
    self.use(key)
    filename = self.getfilename(key)
    if time.time() - self.lastunloadcheck > 900:
        # no unload check for 15 mins? background thread seems to have crashed
        self.fail = True  # trigger lockdown
        self.forceunload()  # force unload of everything
        raise NoSuchDocument("Document Server is in lockdown due to loss of contact with autoupdater thread, refusing to process new documents...")
    if key not in self or forcereload:
        if not os.path.exists(filename):
            log("File not found: " + filename)
            self.done(key)
            raise NoSuchDocument
        if self.fail and not self.ignorefail:
            raise NoSuchDocument("Document Server is in lockdown due to earlier failure during XML serialisation, refusing to process new documents...")
        log("Loading " + filename)
        try:
            self.data[key] = folia.Document(file=filename, setdefinitions=self.setdefinitions, loadsetdefinitions=True)
            self.data[key].changed = False
        except Exception as e:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_tb(exc_traceback, limit=50, file=sys.stderr)
            log("ERROR reading file " + filename + ": " + str(e))
            if logfile:
                traceback.print_tb(exc_traceback, limit=50, file=logfile)
            self.done(key)
            raise
    self.lastaccess[key]['NOSID'] = time.time()
    self.done(key)
    return self.data[key]
def correct(filename, corrected, original, acceptsuggestion, setfilter, classfilter, output):
    changed = False
    try:
        doc = folia.Document(file=filename)
        for text in doc:
            for correction in list(text.select(folia.Correction, setfilter)):
                if not classfilter or correction.cls == classfilter:
                    if original:
                        if correction.hasoriginal():
                            # restore original
                            print("Restoring original version for " + str(correction.id), file=sys.stderr)
                            replace(correction, correction.original())
                            changed = True
                        elif correction.hasoriginal(True):
                            # empty original: this is an insertion, remove it
                            correction.parent.remove(correction)
                    elif corrected:
                        if correction.hasnew():
                            print("Keeping corrected version for " + str(correction.id), file=sys.stderr)
                            replace(correction, correction.new())
                            changed = True
                        elif correction.hassuggestions() and acceptsuggestion:
                            bestsuggestion = None
                            changed = True
                            for suggestion in correction.suggestions():  # was hassuggestions(), which returns a boolean
                                if not bestsuggestion or (suggestion.confidence and not bestsuggestion.confidence) or (suggestion.confidence and bestsuggestion.confidence and suggestion.confidence > bestsuggestion.confidence):
                                    bestsuggestion = suggestion
                            if bestsuggestion:
                                if corrected:
                                    replace(correction, bestsuggestion)
                                else:
                                    raise NotImplementedError  # TODO
                    if output:
                        print(correction.xmlstring())
        if changed:
            if settings.stdout:
                print(doc.xmlstring())
            else:
                doc.save()
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " + filename + ":", e, file=sys.stderr)
        else:
            raise
def build_spacy_doc(folia_file):
    try:
        folia_doc = folia.Document(file=folia_file)
        nlp = spacy.load('en')
        doc = nlp(folia_doc.text())
        return doc
    except Exception as e:
        print("Error occurred while processing " + folia_file + ": " + str(e))
def folia_creator(id, data, docname):
    doc = folia.Document(id=id)
    for i in range(0, len(data)):
        text = doc.add(folia.Text)
        text.add(folia.Word, data[str(i)]['value'])
        text.add(folia.Sentence, data[str(i)]['text'])
    doc.save('./files/' + docname)
    return True
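# Usage sketch for folia_creator, showing the expected shape of 'data':
# string indices mapping to dicts with 'value' and 'text' keys (hypothetical
# content and demo function; assumes a ./files/ directory exists):
def _demo_folia_creator():
    data = {
        '0': {'value': 'Hello', 'text': 'Hello world.'},
        '1': {'value': 'Bye', 'text': 'Bye for now.'},
    }
    folia_creator('example-doc', data, 'example.folia.xml')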
def getFoliaTokens(sentenceId):
    corpus = []
    filepath = "data/EIFD-FlatData/" + sentenceId.split('.')[0] + ".folia.xml"
    doc = folia.Document(file=filepath)
    sentence = doc[sentenceId]
    for word in sentence.words():
        corpus.append(word.text())
    return corpus
def main():
    parser = argparse.ArgumentParser(description="Convert FoLiA to JSON Shared Task format", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-C', dest='corrections', help="Strip corrections", action='store_false', default=True)
    parser.add_argument('file', nargs=1, help='FoLiA Document (input)')
    args = parser.parse_args()
    doc = folia.Document(file=args.file[0])
    data = folia2json(doc, args.corrections)
    print(json.dumps(data, ensure_ascii=False, indent=4))
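# Invocation sketch (the script name is hypothetical):
#
#   python folia2json.py -C document.folia.xml > document.json
#
# Note that -C uses action='store_false', so passing it sets args.corrections
# to False; omitting it leaves the default of True.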
def process(filename, patterns):
    print("Processing " + filename, file=sys.stderr)
    doc = folia.Document(file=filename)
    for match in doc.findwords(*patterns):
        s = ""
        for token in match:
            s += "\t" + token.text()
        s = filename + "\t" + match[0].id + s
        print(s)
def buildfromfolia(self, files, encoding='utf-8'):
    freqlist = FrequencyList()
    if isinstance(files, str):
        files = [files]
    for filename in files:
        f = folia.Document(file=filename)
        for sentence in f.sentences():
            tokens = sentence.toktext().split(' ')
            freqlist.append(tokens)
    self.buildfromfreqlist(freqlist)
def get_annotations(first, second, adjudication_path):
    os.chdir(first)
    filename = []
    for elem in glob.glob("*.xml"):
        filename.append(elem)
    filetext = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        filetext.append(doc.text())
    first_annotations = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        try:
            first_annotations.append(doc.metadata.data['Violent'])
        except KeyError:
            first_annotations.append("Annotation Empty")
    os.chdir(second)  # note: 'second' and 'adjudication_path' should be absolute paths, since the working directory has changed
    second_annotations = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        try:
            second_annotations.append(doc.metadata.data['Violent'])
        except KeyError:
            second_annotations.append("Annotation Empty")
    adjudication = []
    os.chdir(adjudication_path)
    for elem in filename:
        doc = folia.Document(file=elem)
        try:
            adjudication.append(doc.metadata.data['Violent'])
        except KeyError:
            adjudication.append("Adjudication Empty")
    zip_list = list(zip(filename, filetext, first_annotations, second_annotations, adjudication))
    df = pd.DataFrame(zip_list, columns=["Filename", "File Text", "First", "Second", "Adjudication"])
    return df
def folia2sentences(path, tagFormat):
    sentences_as_tokens = []
    ids = []
    id2idx = {}
    idx2id = {}
    all_tokens = []
    actual_tags = []
    if os.path.isdir(path):
        idx = -1
        for filename in os.listdir(path):
            doc = folia.Document(file=path + '/' + filename)
            for h, sentence in enumerate(doc.sentences()):
                sentence_tokenized = sentence.select(folia.Word)
                words_folia = list(sentence_tokenized)
                sentence_tokens = []
                for word in words_folia:
                    w_id = word.id
                    w_text = word.text()
                    if w_id in ids:
                        continue
                    idx = idx + 1
                    if w_text == '<P>':
                        idx = idx - 1
                        continue
                    #if w_text == 'krishnappa':
                    #    idx = idx - 1
                    #    continue
                    ids.append(w_id)
                    id2idx[w_id] = idx
                    idx2id[idx] = w_id
                    actual_tags.append('O')
                    sentence_tokens.append(w_text)
                    all_tokens.append(w_text)
                sentences_as_tokens.append(sentence_tokens)
                for layer in sentence.select(folia.EntitiesLayer):
                    for entity in layer.select(folia.Entity):
                        for word in entity.wrefs():
                            word_id = word.id
                            _idx = id2idx[word_id]
                            if tagFormat == 'stanford':
                                tag = foliaclass2stanfordtag(entity)
                            elif tagFormat == 'conll':
                                print('TODO: reuse codes that output files to output objects instead.')
                                continue  # no tag is produced for this format yet
                            elif tagFormat == 'raw':
                                tag = foliaclass2rawtag(entity)
                            actual_tags[_idx] = tag
    else:
        print("TODO: Handling of a single Folia file instead of a folder of Folia files.")
    return [sentences_as_tokens, all_tokens, actual_tags]
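# Usage sketch for folia2sentences (hypothetical path and demo function;
# 'raw' selects the foliaclass2rawtag conversion referenced above):
def _demo_folia2sentences():
    sentences, tokens, tags = folia2sentences('annotated_folia_docs', 'raw')
    print(len(sentences), 'sentences,', len(tokens), 'tokens')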