Example #1
def foliacat(id, outputfile, *files):
    totalmerges = 0
    outputdoc = folia.Document(id=id)
    text = outputdoc.append(folia.Text(outputdoc,id=id + ".text"))
    for i, filename in enumerate(files):
        merges = 0
        print("Processing " + filename, file=sys.stderr)
        inputdoc = folia.Document(file=filename)
        print("(merging document)",file=sys.stderr)

        for annotationtype, annotationset in inputdoc.annotations:  #avoid shadowing the set() builtin
            if not outputdoc.declared(annotationtype, annotationset):
                outputdoc.declare(annotationtype, annotationset)

        for d in inputdoc.data:
            merges += concat(text, d)

        print("(merged " + str(merges) + " elements, with all elements contained therein)",file=sys.stderr)
        totalmerges += merges

    print("(TOTAL: merged " + str(totalmerges) + " elements, with all elements contained therein)",file=sys.stderr)
    if outputfile and totalmerges > 0:  #use the total across all files, not just the last file's count
        outputdoc.save(outputfile)

    return outputdoc
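
A minimal usage sketch for foliacat above (the document id and file names are hypothetical; it assumes the pynlpl folia module and the concat() helper used in the function are in scope):

merged = foliacat("mycorpus", "merged.folia.xml", "part1.folia.xml", "part2.folia.xml")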
Example #2
def compare(first, second):
    os.chdir(first)
    first_annotations = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        try:
            first_annotations.append(doc.metadata.data['Centrality'])
        except (KeyError, AttributeError):
            first_annotations.append("Annotation Empty")
    os.chdir(second)  #note: if `second` is a relative path, it is now resolved from inside `first`
    second_annotations = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        try:
            second_annotations.append(doc.metadata.data['Centrality'])
        except (KeyError, AttributeError):
            second_annotations.append("Annotation Empty")

    first_annotator_final = []
    second_annotator_final = []
    first_not_labeled = []
    second_not_labeled = []
    for idx, elem in enumerate(first_annotations):
        if (elem == "Urban"
                or elem == "Rural") and (second_annotations[idx] == "Urban" or
                                         second_annotations[idx] == "Rural"):
            first_annotator_final.append(elem)
            second_annotator_final.append(second_annotations[idx])
        else:
            first_not_labeled.append(elem)
            second_not_labeled.append(second_annotations[idx])

    return [[first_annotator_final, second_annotator_final],
            [first_not_labeled, second_not_labeled]]
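
The paired label lists returned above lend themselves directly to an agreement metric; a hedged sketch using scikit-learn (the directory paths are hypothetical, and scikit-learn is an assumption, not part of the original):

from sklearn.metrics import cohen_kappa_score

[[first, second], _] = compare("/data/annotator1", "/data/annotator2")
print(cohen_kappa_score(first, second))  #inter-annotator agreement on the doubly-labeled files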
Example #3
def main():
    global repetitions, target
    files = []
    try:
        begin = 1
        if os.path.exists(sys.argv[1]):
            selectedtests = "all"
            repetitions = 1
        else:
            selectedtests = sys.argv[1].split(',')
            if os.path.exists(sys.argv[2]):
                repetitions = 1
                begin = 2
            else:
                repetitions = int(sys.argv[2])
                begin = 3
        filesordirs = sys.argv[begin:]
    except (IndexError, ValueError):
        print("Syntax: folia_benchmark [testfunctions [repetitions]] files-or-directories+",file=sys.stderr)
        print(" testfunctions is a comma separated list of function names, or the special keyword 'all'", file=sys.stderr)
        print(" directories are recursively searched for files with the extension folia.xml, +gz and +bz2 is supported too.", file=sys.stderr)
        sys.exit(2)


    for fd in filesordirs:
        if not os.path.exists(fd):
            raise Exception("No such file or directory" + fd)
        if os.path.isfile(fd):
            files.append(fd)
        elif os.path.isdir(fd):
            dirs = [fd]
            while dirs:
                dirpath = dirs.pop(0)  #avoid shadowing the dir() builtin
                for filename in glob.glob(dirpath + "/*"):
                    if os.path.isdir(filename):
                        dirs.append(filename)
                    elif filename.endswith('.folia.xml') or filename.endswith('.folia.xml.gz') or filename.endswith('.folia.xml.bz2'):
                        files.append(filename)


    for f in ('loadfile','loadfileleakbypass','readerwords'):
        if f in selectedtests or 'all' in selectedtests:
            for filename in files:
                globals()[f](filename=filename)


    for f in ('xml','text','json','countwords','selectwords','nextwords','ancestors','selectwordsfql','selectwordsfqlforp','selectwordsfqlxml','selectwordsfqlwhere','editwordsfql', 'addelement' ):
        if f in selectedtests or 'all' in selectedtests:
            for filename in files:
                doc = folia.Document(file=filename)
                globals()[f](doc=doc)

    for f in ('memtest',):
        if f in selectedtests or 'all' in selectedtests:
            for filename in files:
                doc = folia.Document(file=filename)
                print("memtest -- Memory test on document " + filename + " -- memory consumption estimated at " + str(round(asizeof.asizeof(doc) / 1024 / 1024,2)) + " MB" + " (filesize " + str(round(os.path.getsize(filename)/1024/1024,2)) + " MB)")
Example #4
def process(data):
    i, filename = data
    category = os.path.basename(os.path.dirname(filename))
    progress = round((i+1) / float(len(index)) * 100,1)    
    print("#" + str(i+1) + " " + filename + ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' +  str(progress) + '%',file=sys.stderr)
    try:
        doc = folia.Document(file=filename)
    except Exception as e:
        print("ERROR loading " + filename + ":" + str(e),file=sys.stderr)
        return False
    filename = filename.replace(sonardir,'')
    if filename[0] == '/':
        filename = filename[1:]
    if filename[-4:] == '.pos':
        filename = filename[:-4]
    if filename[-4:] == '.tok':
        filename = filename[:-4]    
    if filename[-4:] == '.ilk':
        filename = filename[:-4]    
    #Load document prior to tokenisation
    try:
        pretokdoc = folia.Document(file=sonardir + '/' + filename)
    except Exception:
        print("WARNING unable to load pretokdoc " + filename, file=sys.stderr)
        pretokdoc = None
    if pretokdoc:
        for p2 in pretokdoc.paragraphs():
            try:
                p = doc[p2.id]        
            except KeyError:
                print("ERROR: Paragraph " + p2.id + " not found. Tokenised and pre-tokenised versions out of sync?", file=sys.stderr)
                continue
            if p2.text:
                p.text = p2.text                     
    #create the target directory if it does not already exist
    os.makedirs(foliadir + os.path.dirname(filename), exist_ok=True)
        
    try:
        doc.save(foliadir + filename)
    except Exception as e:
        print("ERROR saving " + foliadir + filename + ": " + str(e), file=sys.stderr)
    
    try:
        f = codecs.open(foliadir + filename.replace('.xml', '.tok.txt'), 'w', 'utf-8')
        f.write(str(doc))  #str() replaces the Python 2 unicode() call here
        f.close()
    except Exception as e:
        print("ERROR saving " + foliadir + filename.replace('.xml', '.tok.txt') + ": " + str(e), file=sys.stderr)

            
    sys.stdout.flush()
    sys.stderr.flush()
    return True
Example #5
def process(filename, outputfile=None):
    print("Processing " + filename, file=sys.stderr)
    count = Counter()
    try:
        doc = folia.Document(file=filename)
        count['documents'] += 1

        for e in doc.select(folia.AbstractElement):
            if e.XMLTAG and (not settings.types or e.XMLTAG in settings.types):
                count[e.XMLTAG] += 1

        for constraintag, constrainf in settings.constraints:
            if not constrainf(count[constraintag]):
                print("Skipping due to unmet constraints (" + constraintag +
                      "): " + filename,
                      file=sys.stderr)
                return Counter({'skipped_documents': 1})

        print("Counted " + filename, file=sys.stderr)

    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " +
                  filename + ":",
                  e,
                  file=sys.stderr)
        else:
            raise

    return count
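
Because process() returns a Counter, per-file tallies can simply be summed; a minimal sketch (the file names are hypothetical and the settings object used above is assumed to be configured):

from collections import Counter

total = Counter()
for fname in ("a.folia.xml", "b.folia.xml"):
    total += process(fname)
print(dict(total))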
Example #6
def foliamerge(outputfile, *files, **kwargs):
    asalternative = kwargs.get('asalternative', False)
    outputdoc = None
    merges = 0

    for i, filename in enumerate(files):
        print("Processing " + filename, file=sys.stderr)
        inputdoc = folia.Document(file=filename)
        if i == 0:
            print("(pivot document)", file=sys.stderr)
            outputdoc = inputdoc
        else:
            print("(merging document)", file=sys.stderr)

            for annotationtype, annotationset in inputdoc.annotations:  #avoid shadowing the set() builtin
                if not outputdoc.declared(annotationtype, annotationset):
                    outputdoc.declare(annotationtype, annotationset)

            for e in inputdoc:
                merges += mergechildren(e, outputdoc, asalternative)

    if outputfile and merges > 0:
        outputdoc.save(outputfile)

    return outputdoc
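
A usage sketch for foliamerge (hypothetical file names): the first file serves as the pivot document into which the annotations of the later files are merged, optionally as alternatives:

merged = foliamerge("merged.folia.xml", "pivot.folia.xml", "extra.folia.xml", asalternative=True)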
Example #7
def compare(path,doc):
    convert(path,doc)
    ann = Annotations(path+doc)
    fdoc = folia.Document(file=path+doc+".xml")
    #test entities
    for ent in ann.get_textbounds():
        try:
            found = fdoc[ent.id]
            text = [str(a) for a in found.wrefs()]
            if ent.tail.strip() != " ".join(text):
                print("error: entity not found")
                print(ent)
                return False
        except KeyError:
            print("error: entity not found")
            print(ent)
            return False
    #test relations
    for rel in ann.get_relations():
        try:
            found = fdoc[rel.id]
            arefs = found.select(folia.AlignReference)
            if not (any(a.id == rel.arg1 for a in arefs) and any(a.id == rel.arg2 for a in arefs)):
                print("error: relation not found")
                print(rel)
                return False
        except KeyError:
            print("error: relation not found")
            print(rel)
            return False
    #test events
    for event in ann.get_events():
        try:
            found = fdoc[event.id]
            arefs = found.select(folia.AlignReference)
            for role, rid in event.args:
                if not any(a.id == rid for a in arefs):
                    print("error: event argument not found")
                    print(event)  #report the event itself, not the rel left over from the previous loop
                    return False
        except KeyError:
            print("error: event not found")
            print(event)
            return False
    #test attributes
    for attr in ann.get_attributes():
        try:
            found = fdoc[attr.target]
            if not any(fattr.cls == str(attr.value) and fattr.subset == attr.type for fattr in found.select(folia.Feature)):
                print("error: attribute not found")
                print(attr)
                print()
                return False
        except KeyError:
            print("error: attribute not found")
            print(attr)  #report the attribute, not the rel left over from an earlier loop
            return False

    print "file "+path+doc+" is OK"
    return True
Example #8
def makefoliadoc(outputfile):
    baseid = os.path.basename(outputfile).replace('.folia.xml',
                                                  '').replace('.xml', '')
    foliadoc = folia.Document(id=baseid)
    foliadoc.append(folia.Text(foliadoc, id=baseid + '.text'))

    if not foliadoc.declared(folia.AnnotationType.TOKEN, 'alpino-tokens'):
        foliadoc.declare(folia.AnnotationType.TOKEN, 'alpino-tokens')
    if not foliadoc.declared(folia.LemmaAnnotation, 'alpino-lemmas'):
        foliadoc.declare(folia.LemmaAnnotation, 'alpino-lemmas')
    if not foliadoc.declared(folia.SenseAnnotation, 'alpino-sense'):
        foliadoc.declare(folia.SenseAnnotation, 'alpino-sense')
    if not foliadoc.declared(folia.PosAnnotation, 'alpino-pos'):
        foliadoc.declare(folia.PosAnnotation, 'alpino-pos')
    if not foliadoc.declared(folia.AnnotationType.DEPENDENCY,
                             'alpino-dependency'):
        foliadoc.declare(folia.AnnotationType.DEPENDENCY, 'alpino-dependency')
    if not foliadoc.declared(folia.AnnotationType.SYNTAX, 'alpino-syntax'):
        foliadoc.declare(folia.AnnotationType.SYNTAX, 'alpino-syntax')
    if not foliadoc.declared(folia.AnnotationType.MORPHOLOGICAL,
                             'alpino-morphology'):
        foliadoc.declare(folia.AnnotationType.MORPHOLOGICAL,
                         'alpino-morphology')

    return foliadoc
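
A minimal usage sketch (the output name is hypothetical); the document id is derived from the file name:

doc = makefoliadoc("example.folia.xml")
print(doc.id)  #prints "example"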
Example #9
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])  #shortopts take the option letter without the dash
    except getopt.GetoptError as err:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    for o, a in opts:
        if o == '-h' or o == '--help':
            usage()
            sys.exit(0)
        else:
            raise Exception("No such option: " + o)

    if len(args) < 2:
        usage()
        sys.exit(2)
    else:
        alpinofiles = args[:-1]
        foliafile = args[-1]

    if os.path.exists(foliafile):
        doc = folia.Document(file=foliafile)
    else:
        doc = makefoliadoc(foliafile)
    for alpinofile in alpinofiles:
        doc = alpino2folia(alpinofile, doc)
    doc.save(foliafile)
Example #10
def convert(f_i, f_o=None):
    """
    f_i/f_o: input/output file name/path without extension (str)
    ...
    """
    doc_i = Eaf(''.join([f_i, '.eaf']))

    if not f_o:
        f_o = f_i

    # https://pynlpl.readthedocs.io/en/latest/folia.html#editing-folia
    # https://pynlpl.readthedocs.io/en/latest/folia.html#adding-structure
    # https://pynlpl.readthedocs.io/en/latest/folia.html#structure-annotation-types
    print(os.path.basename(f_o))
    doc_o = folia.Document(id=os.path.basename(f_o))
    # https://github.com/proycon/folia/blob/master/foliatools/conllu2folia.py
    doc_o.declare(folia.LemmaAnnotation,
                  set=SET_LEMMA_MYSTEM,
                  annotator="Mystem")
    doc_o.declare(folia.PosAnnotation, set=SET_POS_MYSTEM, annotator="Mystem")
    doc_o.declare(folia.PosAnnotation, set=SET_POS, annotator="BiRCh group")
    doc_o.declare(folia.SyntacticUnit, set=SET_SU, annotator="BiRCh group")
    speech = doc_o.append(folia.Speech)
    for aa in create_conversation(get_aas(doc_i)):
        utterance = speech.append(folia.Utterance,
                                  id=aa[0],
                                  speaker=aa[1],
                                  begintime=aa[2],
                                  endtime=aa[3])

        # https://docs.python.org/3/library/string.html#formatspec
        #utterance.append(folia.Word,'{:10}:'.format(aa[1]))
        utterance.append(folia.Word, '{}:'.format(aa[1].upper()))
        for w in get_tokens(aa[4]):
            # handle visibility of tokens in the form of tags
            if len(w) > 1 and w[0] == '<' and w[1] != '$':
                #print(w)
                w = '<$' + w[1:]
            token = utterance.append(folia.Word, w)
            if is_token_mystem(w):
                analysis_mystem = m.analyze(w)[0]['analysis']
                if analysis_mystem:
                    # mystem's lexeme -> lemma annotation (???)
                    if 'lex' in analysis_mystem[0]:
                        token.append(folia.LemmaAnnotation,
                                     cls=analysis_mystem[0]['lex'],
                                     set=SET_LEMMA_MYSTEM)
                    if 'gr' in analysis_mystem[0]:
                        pos_plus = analysis_mystem[0]['gr'].strip()
                        pos, features = analyze_mystem_gr(pos_plus)
                        an_pos = token.append(folia.PosAnnotation,
                                              head=pos,
                                              cls=pos_plus,
                                              set=SET_POS_MYSTEM)
                        # https://pynlpl.readthedocs.io/en/latest/folia.html#features
                        an_pos.append(folia.Feature,
                                      subset='all',
                                      cls=features)

    doc_o.save(''.join([f_o, '.folia.xml']))  #save under the output base name (f_i was used here before, ignoring f_o)
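
A usage sketch (the base name is hypothetical; the Eaf reader, the mystem instance m and the SET_* constants used above are assumed to be defined): this reads recording1.eaf and, with no explicit output name, writes recording1.folia.xml next to it:

convert('recording1')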
Example #11
def process_file(csv_writer, filename):
    """
    Reads a single FoLiA .xml-file, loops over its Sentences,
    and writes the annotations (Correction/SemanticRole) to the csv file.
    """
    doc = folia.Document(file=filename)

    for sentence in doc.sentences():
        sentence_nr = sentence.id.split('.')[-1]

        # Add Corrections on Sentence and Word level
        csv_writer.writerows(get_corrections(sentence, doc.id, sentence_nr))
        for word in sentence.words():
            csv_writer.writerows(get_corrections(word, doc.id, sentence_nr))

        # Add SemanticRoles
        for semrole in sentence.select(folia.SemanticRole):
            problem = get_feature(semrole, 'problem')
            pos = get_feature(semrole, 'pos')
            try:
                s_original = sentence.text(
                    correctionhandling=folia.CorrectionHandling.ORIGINAL)
                s_corrected = sentence.text(
                    correctionhandling=folia.CorrectionHandling.CURRENT)
            except folia.NoSuchText:
                s_original = ''
                s_corrected = ''
            csv_writer.writerow([
                doc.id, sentence_nr,
                get_text_from_semrole(semrole), '', semrole.cls, problem, pos,
                s_original, s_corrected
            ])
Example #12
def process(filename, queries):
    try:
        print("Processing " + filename, file=sys.stderr)
        doc = folia.Document(file=filename)
        dosave = False
        for query in queries:
            if query.format == "python":
                query.format = "xml"
            output = query(doc)
            print(output)
            if query.action and query.action.action in ('EDIT', 'DELETE',
                                                        'SUBSTITUTE',
                                                        'PREPEND', 'APPEND'):
                dosave = True
        #save document if changes are made
        if dosave:
            print("Saving " + filename, file=sys.stderr)
            doc.save()
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " +
                  filename + ":",
                  e,
                  file=sys.stderr)
        else:
            raise
Example #13
def folia_docnameetypewords2file(inpath, outpath):
    outfile = open(outpath, 'w')
    ids = []
    sentences_num = 0
    if os.path.isdir(inpath):
        for filename in os.listdir(inpath):
            doc = folia.Document(file=inpath + '/' + filename)
            if filename == "https__timesofindia.indiatimes.com_city_hyderabad_1st-anniversary-of-anti-power-hike-rally_articleshow_727307023.folia.xml":
                print("Here")
            docnamewritten = False
            for h, sentence in enumerate(doc.sentences()):
                for layer in sentence.select(folia.EntitiesLayer):
                    for i, entity in enumerate(layer.select(folia.Entity)):
                        if entity.cls == 'etype':
                            if not docnamewritten:
                                outfile.write('\n' + filename + '\n')
                                docnamewritten = True
                            sentence_tokenized = sentence.select(folia.Word)
                            words_folia = list(sentence_tokenized)
                            word_classes = [w.cls for w in words_folia]
                            if 'URL' in word_classes:
                                continue
                            sentences_num += 1
                            for word in entity.wrefs():
                                word_text = word.text()
                                outfile.write(word_text + '\n')

    else:
        print("TODO: Handling of a single Folia file instead of a folder of Folia files.")
    outfile.close()
Example #14
def folia_sentences2file(inpath, outpath):
    outfile = open(outpath, 'w')
    ids = []
    sentences_num = 0
    if os.path.isdir(inpath):
        for filename in os.listdir(inpath):
            doc = folia.Document(file=inpath + '/' + filename)
            for h, sentence in enumerate(doc.sentences()):
                sentence_tokenized = sentence.select(folia.Word)
                words_folia = list(sentence_tokenized)
                word_classes = [w.cls for w in words_folia]
                if 'URL' in word_classes:
                    continue
                sentences_num += 1
                for i,word in enumerate(words_folia):
                    w_id = word.id
                    w_text = word.text()
                    if w_id in ids:
                        continue
                    if w_text == '<P>':
                        continue
                    ids.append(w_id)
                    # If word.next() returns None, the <entities> tag has been reached and it is time for a newline.
                    # The word.next() check is needed for sentences that have entities tagged: the
                    # i + 1 == len(words_folia) check alone fails there, since the wrefs inside the entities are counted as words too.
                    if (not word.next()) or i + 1 == len(words_folia):
                        outfile.write(w_text.lower() + '\n')
                    else:
                        outfile.write(w_text.lower() + ' ')
    else:
        print("TODO: Handling of a single Folia file instead of a folder of Folia files.")
    outfile.close()
Example #15
def process(target):
    print "Processing " + target
    if os.path.isdir(target):
        print "Descending into directory " + target
        for f in glob.glob(target + '/*'):
            process(f)
    elif os.path.isfile(target) and target[-4:] == '.xml':            
        print "Loading " + target
        try:
            doc = folia.Document(file=target)
        except lxml.etree.XMLSyntaxError:
            print >>sys.stderr, "UNABLE TO LOAD " + target + " (XML SYNTAX ERROR!)"
            return None
        changed = False
        for word in doc.words():
            try:
                pos = word.annotation(folia.PosAnnotation)                
            except folia.NoSuchAnnotation:
                continue
            try:
                word.replace( cgn.parse_cgn_postag(pos.cls) )
                changed = True
            except cgn.InvalidTagException:
                print >>sys.stderr, "WARNING: INVALID TAG " + pos.cls
                continue
        if changed:
            print "Saving..."
            doc.save()
Example #16
def __get_folia_doc__(self, tokens):
    doc = folia.Document(id='nltk-sentence')
    folia_sent = doc.add(folia.Text)
    for tok, pos in tokens:
        word = folia_sent.add(folia.Word, tok)
        word.add(folia.PosAnnotation(None, set='custom', cls=pos))
    return doc
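
A usage sketch with hypothetical POS-tagged tokens (e.g. the output of nltk.pos_tag), assuming tagger is an instance of the class this method belongs to; calling the private method directly is for illustration only:

doc = tagger.__get_folia_doc__([("Hello", "UH"), ("world", "NN")])
print(doc.xmlstring())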
Example #17
    def upload(self, *namespaceargs):
        namespace = validatenamespace('/'.join(namespaceargs))
        log("In upload, namespace=" + namespace)
        response = {'version':VERSION}
        cl = cherrypy.request.headers['Content-Length']
        data = cherrypy.request.body.read(int(cl))
        cherrypy.response.headers['Content-Type'] = 'application/json'
        #data =cherrypy.request.params['data']
        try:
            log("Loading document from upload")
            doc = folia.Document(string=data,setdefinitions=self.docstore.setdefinitions, loadsetdefinitions=True)
            if not self.allowtextredundancy:
                for e in doc.data:
                    cleantextredundancy(e)
            doc.changed = True
            response['docid'] = doc.id
            self.docstore[(namespace,doc.id)] = doc
        except Exception as e:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            formatted_lines = traceback.format_exc().splitlines()
            traceback.print_tb(exc_traceback, limit=50, file=sys.stderr)
            response['error'] = "Uploaded file is no valid FoLiA Document: " + str(e) + " -- " "\n".join(formatted_lines)
            log(response['error'])
            if logfile: traceback.print_tb(exc_traceback, limit=50, file=logfile)
            return json.dumps(response).encode('utf-8')

        filename = self.docstore.getfilename( (namespace, doc.id))
        i = 1
        while os.path.exists(filename):
            filename = self.docstore.getfilename( (namespace, doc.id + "." + str(i)))
            i += 1
        self.docstore.save((namespace,doc.id), "Initial upload")
        return json.dumps(response).encode('utf-8')
Example #18
def process(filename):
    print >> sys.stderr, "Processing " + filename
    doc = folia.Document(file=filename)

    freqlist = FrequencyList()

    if settings.n == 1:
        for word in doc.words():
            text = word.toktext()
            if not settings.casesensitive: text = text.lower()  #lowercase only when case-insensitive
            freqlist.count(text)
    elif settings.sentencemarkers:
        for sentence in doc.sentences():
            for ngram in Windower(sentence.words(), settings.n):
                text = ' '.join(x.toktext() for x in ngram)  #join the tokens of the n-gram tuple
                if not settings.casesensitive: text = text.lower()
                freqlist.count(text)
    else:
        for ngram in Windower(doc.words(), settings.n, None, None):  #was iterating an undefined `sentence`
            text = ' '.join(x.toktext() for x in ngram)
            if not settings.casesensitive: text = text.lower()
            freqlist.count(text)

    if settings.autooutput:
        if filename[-len(settings.extension) -
                    1:].lower() == '.' + settings.extension:
            outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
        else:
            outfilename = filename + '.freqlist'  #was +=, but outfilename was never initialised on this path
        freqlist.save(outfilename, True)

    return freqlist
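
A minimal usage sketch (hypothetical file name; assumes the settings object used above is configured, e.g. settings.n = 1, settings.casesensitive = False, settings.autooutput = False):

freqlist = process("example.folia.xml")
freqlist.save("example.freqlist", True)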
Example #19
def process_file(filename):
    """
    Reads a single FoLiA .xml-file, loops over its Sentences,
    and writes the text to the csv file.
    """
    bname, _ = os.path.splitext(os.path.basename(filename))
    dname = os.path.dirname(filename)
    orig_name = os.path.join(dname, 'orig_{}.txt'.format(bname))
    corr_name = os.path.join(dname, 'corr_{}.txt'.format(bname))

    with codecs.open(orig_name, 'wb', 'utf-8') as orig_file:
        with codecs.open(corr_name, 'wb', 'utf-8') as corr_file:
            doc = folia.Document(file=filename)

            for sentence in doc.sentences():
                try:
                    s_original = sentence.text(
                        correctionhandling=folia.CorrectionHandling.ORIGINAL)
                    s_corrected = sentence.text(
                        correctionhandling=folia.CorrectionHandling.CURRENT)
                except folia.NoSuchText:
                    s_original = ''
                    s_corrected = ''

                orig_file.write(replace_specials(s_original))
                orig_file.write('\n')

                corr_file.write(replace_specials(s_corrected))
                corr_file.write('\n')
Example #20
def cql_search(request):
    from pynlpl.formats import fql, cql
    # parse the incoming parameters
    params = json.loads(request.body.decode('utf-8'))
    # update the FoLiA document from the current data
    doc = folia.Document(id='doc')
    text = folia.Text(doc, id='doc.text')
    sentences = Sentence.objects.all()
    # search for words in the document
    for s in sentences:
        sen = text.append(folia.Sentence(doc, id=doc.id + '.s.' + str(s.id)))
        words = Word.objects.filter(Sentence_id=s.id)
        for w in words:
            sen.append(
                folia.Word(doc,
                           id=doc.id + '.s.' + str(s.id) + '.w.' + str(w.id),
                           text=w.value))
    doc.append(text)
    query = fql.Query(cql.cql2fql(params['title']))
    texts = query(doc)
    arr = []
    for t in texts:
        arr.append(t[0].parent.id.split('s.')[1])
    sens = Sentence.objects.filter(id__in=arr)
    # output the results
    return render(request, 'cabinet/cql_results.html', {
        'texts': texts,
        'sens': sens
    })
Example #21
def load(self, key, forcereload=False):
    if key[0] == "testflat": key = ("testflat", "testflat")
    self.use(key)
    filename = self.getfilename(key)
    if time.time() - self.lastunloadcheck > 900: #no unload check for 15 mins? background thread seems to have crashed?
        self.fail = True #trigger lockdown
        self.forceunload() #force unload of everything
        raise NoSuchDocument("Document Server is in lockdown due to loss of contact with autoupdater thread, refusing to process new documents...")
    if key not in self or forcereload:
        if not os.path.exists(filename):
            log("File not found: " + filename)
            self.done(key)
            raise NoSuchDocument
        if self.fail and not self.ignorefail:
            raise NoSuchDocument("Document Server is in lockdown due to earlier failure during XML serialisation, refusing to process new documents...")
        log("Loading " + filename)
        try:
            self.data[key] = folia.Document(file=filename, setdefinitions=self.setdefinitions, loadsetdefinitions=True)
            self.data[key].changed = False
        except Exception as e:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            traceback.print_tb(exc_traceback, limit=50, file=sys.stderr)
            log("ERROR reading file " + filename + ": " + str(e))
            if logfile: traceback.print_tb(exc_traceback, limit=50, file=logfile)
            self.done(key)
            raise
        self.lastaccess[key]['NOSID'] = time.time()
    self.done(key)
    return self.data[key]
Example #22
def correct(filename, corrected, original, acceptsuggestion, setfilter,
            classfilter, output):
    changed = False
    try:
        doc = folia.Document(file=filename)
        for text in doc:
            for correction in list(text.select(folia.Correction, setfilter)):
                if not classfilter or correction.cls == classfilter:
                    if original:
                        if correction.hasoriginal():
                            #restore original
                            print("Restoring original version for " +
                                  str(correction.id),
                                  file=sys.stderr)
                            replace(correction, correction.original())
                            changed = True
                        elif correction.hasoriginal(
                                True):  #check for empty original
                            #insertion, remove it
                            correction.parent.remove(correction)
                    elif corrected:
                        if correction.hasnew():
                            print("Keeping corrected version for " +
                                  str(correction.id),
                                  file=sys.stderr)
                            replace(correction, correction.new())
                            changed = True
                    elif correction.hassuggestions() and acceptsuggestion:
                        bestsuggestion = None
                        changed = True
                        for suggestion in correction.suggestions():  #iterate the suggestions themselves; hassuggestions() only tests for their presence
                            if not bestsuggestion or (
                                    suggestion.confidence
                                    and not bestsuggestion.confidence) or (
                                        suggestion.confidence
                                        and bestsuggestion.confidence
                                        and suggestion.confidence >
                                        bestsuggestion.confidence):
                                bestsuggestion = suggestion
                        if bestsuggestion:
                            if corrected:
                                replace(correction, bestsuggestion)
                            else:
                                raise NotImplementedError  #TODO
                    if output:
                        print(correction.xmlstring())
        if changed:
            if settings.stdout:
                print(doc.xmlstring())
            else:
                doc.save()
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " +
                  filename + ":",
                  e,
                  file=sys.stderr)
        else:
            raise
Example #23
def build_spacy_doc(folia_file):
    try:
        folia_doc = folia.Document(file=folia_file)
        nlp = spacy.load('en')
        doc = nlp(folia_doc.text())
        return doc
    except Exception as e:
        print("Error occurred while processing " + folia_file + ": " + str(e))
Example #24
def folia_creator(id, data, docname):
    doc = folia.Document(id=id)
    for i in range(0, len(data)):
        text = doc.add(folia.Text)
        text.add(folia.Word, data[str(i)]['value'])
        text.add(folia.Sentence, data[str(i)]['text'])
    doc.save('./files/' + docname)
    return True
Example #25
def getFoliaTokens(sentenceId):
    corpus = []
    filepath = "data/EIFD-FlatData/" + sentenceId.split('.')[0] + ".folia.xml"
    doc = folia.Document(file=filepath)
    sentence = doc[sentenceId]
    for word in sentence.words():
        corpus.append(word.text())
    return corpus
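
A usage sketch (the sentence id is hypothetical): the part before the first dot selects the file under data/EIFD-FlatData/, so the corresponding .folia.xml must exist there:

tokens = getFoliaTokens("mydoc.p.1.s.2")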
Example #26
def main():
    parser = argparse.ArgumentParser(description="Convert FoLiA to JSON Shared Task format", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-C',dest='corrections',help="Strip corrections", action='store_false',default=True)
    parser.add_argument('file', nargs=1, help='FoLiA Document (input)')
    args = parser.parse_args()

    doc = folia.Document(file=args.file[0])
    data = folia2json(doc, args.corrections)
    print(json.dumps(data, ensure_ascii=False, indent=4))
Example #27
def process(filename, patterns):
    print >> sys.stderr, "Processing " + filename
    doc = folia.Document(file=filename)
    for match in doc.findwords(*patterns):
        s = u""
        for token in match:
            s += u"\t" + token.text()
        s = filename + "\t" + match[0].id + s
        print s.encode(settings.encoding)
Example #28
        def buildfromfolia(self, files, encoding='utf-8'):
            freqlist = FrequencyList()
            if isinstance(files, str): files = [files]
            for filename in files:
                f = folia.Document(file=filename)
                for sentence in f.sentences():
                    tokens = sentence.toktext().split(' ')
                    freqlist.append(tokens)

            self.buildfromfreqlist(freqlist)
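
A usage sketch, assuming lm is an instance of the enclosing class (the file names are hypothetical):

lm.buildfromfolia(["a.folia.xml", "b.folia.xml"])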
Example #29
def get_annotations(first, second, adjudication_path):
    os.chdir(first)
    filename = []
    for elem in glob.glob("*.xml"):
        filename.append(elem)

    filetext = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        filetext.append(doc.text())

    first_annotations = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        try:
            first_annotations.append(doc.metadata.data['Violent'])
        except (KeyError, AttributeError):
            first_annotations.append("Annotation Empty")
    os.chdir(second)
    second_annotations = []
    for elem in glob.glob("*.xml"):
        doc = folia.Document(file=elem)
        try:
            second_annotations.append(doc.metadata.data['Violent'])
        except (KeyError, AttributeError):
            second_annotations.append("Annotation Empty")

    adjudication = []
    os.chdir(adjudication_path)
    for elem in filename:
        doc = folia.Document(file=elem)
        try:
            adjudication.append(doc.metadata.data['Violent'])
        except (KeyError, AttributeError):
            adjudication.append("Adjudication Empty")

    zip_list = list(
        zip(filename, filetext, first_annotations, second_annotations,
            adjudication))
    df = pd.DataFrame(
        zip_list,
        columns=["Filename", "File Text", "First", "Second", "Adjudication"])
    return df
Example #30
def folia2sentences(path, tagFormat):
    sentences_as_tokens = []
    ids = []
    id2idx = {}
    idx2id = {}
    all_tokens = []
    actual_tags = []
    if os.path.isdir(path):
        idx = -1
        for filename in os.listdir(path):
            doc = folia.Document(file=path + '/' + filename)
            for h, sentence in enumerate(doc.sentences()):
                sentence_tokenized = sentence.select(folia.Word)
                words_folia = list(sentence_tokenized)
                sentence_tokens = []
                for word in words_folia:
                    w_id = word.id
                    w_text = word.text()
                    if w_id in ids:
                        continue
                    idx = idx + 1
                    if w_text == '<P>':
                        idx = idx - 1
                        continue
                    ids.append(w_id)
                    id2idx[w_id] = idx
                    idx2id[idx] = w_id
                    actual_tags.append('O')
                    sentence_tokens.append(w_text)
                    all_tokens.append(w_text)

                sentences_as_tokens.append(sentence_tokens)
                for layer in sentence.select(folia.EntitiesLayer):
                    for entity in layer.select(folia.Entity):
                        for word in entity.wrefs():
                            word_id = word.id
                            _idx = id2idx[word_id]
                            if tagFormat == 'stanford':
                                tag = foliaclass2stanfordtag(entity)
                            elif tagFormat == 'conll':
                                print('TODO: reuse codes that output files to output objects instead.')
                                continue  #no tag is produced for this format yet, so skip the assignment below
                            elif tagFormat == 'raw':
                                tag = foliaclass2rawtag(entity)
                            actual_tags[_idx] = tag

    else:
        print("TODO: Handling of a single Folia file instead of a folder of Folia files.")
    return [sentences_as_tokens, all_tokens, actual_tags]