Beispiel #1
0
    def __iter__(self):
        """Iterate over the alignment file, one sentence pair per record.

        The file is rewound and then read as consecutive three-line records:
        a header line starting with ``#``, a target line and a source line.

        Yields:
            GizaSentenceAlignment(sourceline, targetline, sentenceindex), with
            sentenceindex counting records from 1.

        Raises:
            Exception: if a record does not start with a ``#`` header line or
                if the file ends in the middle of a record.
        """
        self.f.seek(0)
        sentenceindex = 0

        # Read with a sentinel rather than letting StopIteration escape:
        # inside a generator that surfaces as RuntimeError (PEP 479), which
        # made an empty file blow up instead of simply yielding nothing.
        header = next(self.f, None)
        while header is not None:
            sentenceindex += 1
            line = u(header)
            if not line.startswith("#"):
                raise Exception(
                    "Error parsing GIZA++ Alignment at sentence "
                    + str(sentenceindex)
                    + ", expected new fragment, found: "
                    + repr(line)
                )

            targetline = next(self.f, None)
            sourceline = next(self.f, None)
            if targetline is None or sourceline is None:
                raise Exception(
                    "Error parsing GIZA++ Alignment at sentence "
                    + str(sentenceindex)
                    + ", unexpected end of file"
                )

            yield GizaSentenceAlignment(u(sourceline), u(targetline), sentenceindex)

            header = next(self.f, None)
Beispiel #2
0
    def process(self, input_data, source_encoding="utf-8", return_unicode=True, oldfrog=False):
        """Send text to the Frog server and return its per-token analysis.

        Args:
            input_data: the text to analyse; a list or tuple of strings is
                joined with single spaces first.
            source_encoding: encoding handed to u() when decoding input_data.
            return_unicode: ignored; kept only for backwards compatibility.
            oldfrog: when true, no ``EOT`` terminator line is sent after the
                input.

        Returns:
            A list with one tuple per token: (word, lemma, morph, pos), or,
            when self.returnall is set, (word, lemma, morph, pos, ner, chunk,
            parse1, parse2). Missing optional columns are empty strings. A
            tuple of Nones is inserted before every token numbered 1 except
            the first one.

        Raises:
            Exception: if a token line has too few columns.
        """
        if isinstance(input_data, (list, tuple)):
            input_data = " ".join(input_data)

        # decode (or preferably do this in an earlier stage)
        input_data = u(input_data, source_encoding).strip(" \t\n")

        request = input_data.encode(self.server_encoding) + b"\r\n"
        if not oldfrog:
            request += b"EOT\r\n"
        self.socket.sendall(request)  # send to socket in desired encoding

        output = []
        done = False
        while not done:
            data = b""
            while not data.endswith(b"\n"):
                moredata = self.socket.recv(self.BUFSIZE)
                if not moredata:
                    # The server closed the connection, so READY will never
                    # arrive. Handle what was received and stop instead of
                    # spinning on an empty socket forever.
                    done = True
                    break
                data += moredata

            data = u(data, self.server_encoding)

            for line in data.strip(" \t\r\n").split("\n"):
                if line == "READY":
                    done = True
                    break
                if not line:
                    continue

                columns = line.split("\t")  # split on tab
                # first column is the token number
                if len(columns) <= 4 or not columns[0].isdigit():
                    continue

                fields = columns[1:]
                if len(fields) < 5:
                    raise Exception(
                        "Can't process response line from Frog: " + repr(columns)
                        + " got unexpected number of fields " + str(len(fields) + 1)
                    )

                if columns[0] == "1" and output:
                    # token numbering restarted: mark the boundary
                    output.append((None,) * (8 if self.returnall else 4))

                word, lemma, morph, pos = fields[0:4]
                ner = fields[5] if len(fields) > 5 else ""
                chunk = fields[6] if len(fields) > 6 else ""
                # Each parse column is guarded on its own; reading fields[8]
                # whenever fields[7] existed raised IndexError on 8 fields.
                parse1 = fields[7] if len(fields) > 7 else ""
                parse2 = fields[8] if len(fields) > 8 else ""

                if self.returnall:
                    output.append((word, lemma, morph, pos, ner, chunk, parse1, parse2))
                else:
                    output.append((word, lemma, morph, pos))

        return output
Beispiel #3
0
    def process(self, input_data, source_encoding="utf-8", return_unicode=True, oldfrog=False):
        """Send text to the Frog server and return its per-token analysis.

        Args:
            input_data: the text to analyse; a list or tuple of strings is
                joined with single spaces first.
            source_encoding: encoding handed to u() when decoding input_data.
            return_unicode: ignored; kept only for backwards compatibility.
            oldfrog: when true, no ``EOT`` terminator line is sent after the
                input.

        Returns:
            A list with one tuple per token: (word, lemma, morph, pos), or,
            when self.returnall is set, (word, lemma, morph, pos, ner, chunk,
            parse1, parse2). This version never fills parse1/parse2, so they
            are always empty strings. A tuple of Nones is inserted before
            every token numbered 1 except the first one.

        Raises:
            Exception: if a token line has too few columns.
        """
        if isinstance(input_data, (list, tuple)):
            input_data = " ".join(input_data)

        # decode (or preferably do this in an earlier stage)
        input_data = u(input_data, source_encoding).strip(" \t\n")

        request = input_data.encode(self.server_encoding) + b"\r\n"
        if not oldfrog:
            request += b"EOT\r\n"
        self.socket.sendall(request)  # send to socket in desired encoding

        output = []
        done = False
        while not done:
            data = b""
            while not data.endswith(b"\n"):
                moredata = self.socket.recv(self.BUFSIZE)
                if not moredata:
                    # The server closed the connection, so READY will never
                    # arrive. Handle what was received and stop instead of
                    # spinning on an empty socket forever.
                    done = True
                    break
                data += moredata

            data = u(data, self.server_encoding)

            for line in data.strip(" \t\r\n").split("\n"):
                if line == "READY":
                    done = True
                    break
                if not line:
                    continue

                columns = line.split("\t")  # split on tab
                # first column is the token number
                if len(columns) <= 4 or not columns[0].isdigit():
                    continue

                fields = columns[1:]
                if len(fields) < 5:
                    raise Exception(
                        "Can't process response line from Frog: " + repr(columns)
                        + " got unexpected number of fields " + str(len(fields) + 1)
                    )

                if columns[0] == "1" and output:
                    # token numbering restarted: mark the boundary
                    output.append((None,) * (8 if self.returnall else 4))

                parse1 = parse2 = ""
                word, lemma, morph, pos = fields[0:4]
                ner = fields[5] if len(fields) > 5 else ""
                chunk = fields[6] if len(fields) > 6 else ""

                if self.returnall:
                    output.append((word, lemma, morph, pos, ner, chunk, parse1, parse2))
                else:
                    output.append((word, lemma, morph, pos))

        return output
Beispiel #4
0
 def __str__(self):
     """Render a per-class table of counts and scores, followed by the
     text returned by outputmetrics(). Triggers compute() when needed."""
     if not self.computed:
         self.compute()

     rowformat = "%-15s %d\t%d\t%d\t%d\t%4f\t%4f\t%4f\t%4f\t%4f\n"
     parts = ["%-15s TP\tFP\tTN\tFN\tAccuracy\tPrecision\tRecall(TPR)\tSpecificity(TNR)\tF-score\n" % ("")]
     for label in sorted(set(self.classes)):
         label = u(label)
         row = (
             label,
             self.tp[label],
             self.fp[label],
             self.tn[label],
             self.fn[label],
             self.accuracy(label),
             self.precision(label),
             self.recall(label),
             self.specificity(label),
             self.fscore(label),
         )
         parts.append(rowformat % row)

     table = "".join(parts)
     return table + "\n" + self.outputmetrics()
Beispiel #5
0
    def startcommand(self, command, cwd, stdout, stderr, *arguments, **parameters):
        """Assemble a shell command line, start it and return the process.

        Args:
            command: the command string to start with.
            cwd: working directory for the child; a false value means the
                current directory is inherited.
            stdout: passed to subprocess.Popen as stdout.
            stderr: passed to subprocess.Popen as stderr; also the stream the
                command line is printed to.
            *arguments: appended space-separated after conversion with u().
            **parameters: options appended to the command line. Two keys are
                consumed rather than appended: ``argdelimiter`` (default a
                space) and ``printcommand`` (default True). A value that is
                the boolean True appends only the key; a key ending in ``=``
                is joined to its value directly; anything else is rendered as
                key + argdelimiter + str(value).

        Returns:
            The subprocess.Popen object, which is also stored in self.process.
            self.begintime is set to the start time.
        """
        # Take the control keywords out first so they apply to every option,
        # whatever position they had among the keyword arguments.
        argdelimiter = parameters.pop('argdelimiter', ' ')
        printcommand = parameters.pop('printcommand', True)

        cmd = command
        if arguments:
            cmd += ' ' + " ".join([u(x) for x in arguments])
        for key, value in parameters.items():
            if value is True:
                cmd += ' ' + key
            elif key[-1] != '=':
                cmd += ' ' + key + argdelimiter + str(value)
            else:
                cmd += ' ' + key + str(value)
        if printcommand:
            print("STARTING COMMAND: " + cmd, file=stderr)

        self.begintime = datetime.datetime.now()
        # NOTE(review): the command runs through the shell and nothing is
        # quoted here; callers must not pass untrusted values.
        self.process = subprocess.Popen(cmd, shell=True, cwd=cwd or None, stdout=stdout, stderr=stderr)
        return self.process
Beispiel #6
0
 def __str__(self):
     """Render a per-class table of counts and scores, followed by the
     text returned by outputmetrics(). Triggers compute() when needed."""
     if not self.computed:
         self.compute()

     rowformat = "%-15s %d\t%d\t%d\t%d\t%4f\t%4f\t%4f\t%4f\t%4f\n"
     parts = ["%-15s TP\tFP\tTN\tFN\tAccuracy\tPrecision\tRecall(TPR)\tSpecificity(TNR)\tF-score\n" % ("")]
     for label in sorted(set(self.classes)):
         label = u(label)
         row = (
             label,
             self.tp[label],
             self.fp[label],
             self.tn[label],
             self.fn[label],
             self.accuracy(label),
             self.precision(label),
             self.recall(label),
             self.specificity(label),
             self.fscore(label),
         )
         parts.append(rowformat % row)

     table = "".join(parts)
     return table + "\n" + self.outputmetrics()
Beispiel #7
0
    def __getitem__(self, phrase):
        """Look up *phrase* on the phrase-table server.

        If *phrase* equals self.lastquery, the response stored in
        self.lastresponse is parsed again and the server is not contacted.

        Returns:
            A list of 4-tuples of strings, one per response line that has
            exactly four tab-separated columns. Other non-empty lines only
            produce a warning on stderr.

        Raises:
            KeyError: if the server answers ``NOTFOUND``.
            IOError: if the connection closes before a full response arrived.
        """
        if phrase != self.lastquery:
            # The socket carries bytes, so encode the query explicitly.
            self.socket.sendall(u(phrase).encode('utf-8') + b"\r\n")

            data = b""
            # Test the terminator as bytes: indexing bytes yields an int on
            # Python 3, so comparing data[-1] with '\n' could never match.
            while not data.endswith(b"\n"):
                chunk = self.socket.recv(self.BUFSIZE)
                if not chunk:
                    raise IOError("Connection closed by phrase table server")
                data += chunk
        else:
            data = self.lastresponse

        data = u(data)

        solutions = []
        for line in data.split('\n'):
            line = line.strip('\r\n')
            if line == "NOTFOUND":
                raise KeyError(phrase)
            elif line:
                fields = tuple(line.split("\t"))
                if len(fields) == 4:
                    solutions.append(fields)
                else:
                    sys.stderr.write("PHRASETABLECLIENT WARNING: Unable to parse response line\n")

        self.lastresponse = data
        self.lastquery = phrase

        return solutions
Beispiel #8
0
    def __getitem__(self, phrase):
        """Look up *phrase* on the phrase-table server.

        If *phrase* equals self.lastquery, the response stored in
        self.lastresponse is parsed again and the server is not contacted.

        Returns:
            A list of 4-tuples of strings, one per response line that has
            exactly four tab-separated columns. Other non-empty lines only
            produce a warning on stderr.

        Raises:
            KeyError: if the server answers ``NOTFOUND``.
            IOError: if the connection closes before a full response arrived.
        """
        if phrase != self.lastquery:
            # The socket carries bytes, so encode the query explicitly.
            self.socket.sendall(u(phrase).encode('utf-8') + b"\r\n")

            data = b""
            # Test the terminator as bytes: indexing bytes yields an int on
            # Python 3, so comparing data[-1] with '\n' could never match.
            while not data.endswith(b"\n"):
                chunk = self.socket.recv(self.BUFSIZE)
                if not chunk:
                    raise IOError("Connection closed by phrase table server")
                data += chunk
        else:
            data = self.lastresponse

        data = u(data)

        solutions = []
        for line in data.split('\n'):
            line = line.strip('\r\n')
            if line == "NOTFOUND":
                raise KeyError(phrase)
            elif line:
                fields = tuple(line.split("\t"))
                if len(fields) == 4:
                    solutions.append(fields)
                else:
                    sys.stderr.write("PHRASETABLECLIENT WARNING: Unable to parse response line\n")

        self.lastresponse = data
        self.lastquery = phrase

        return solutions
Beispiel #9
0
    def startcommand(self, command, cwd, stdout, stderr, *arguments, **parameters):
        """Assemble a shell command line, start it and return the process.

        Args:
            command: the command string to start with.
            cwd: working directory for the child; a false value means the
                current directory is inherited.
            stdout: passed to subprocess.Popen as stdout.
            stderr: passed to subprocess.Popen as stderr; also the stream the
                command line is printed to.
            *arguments: appended space-separated after conversion with u().
            **parameters: options appended to the command line. Two keys are
                consumed rather than appended: ``argdelimiter`` (default a
                space) and ``printcommand`` (default True). A value that is
                the boolean True appends only the key; a key ending in ``=``
                is joined to its value directly; anything else is rendered as
                key + argdelimiter + str(value).

        Returns:
            The subprocess.Popen object, which is also stored in self.process.
            self.begintime is set to the start time.
        """
        # Take the control keywords out first so they apply to every option,
        # whatever position they had among the keyword arguments.
        argdelimiter = parameters.pop('argdelimiter', ' ')
        printcommand = parameters.pop('printcommand', True)

        cmd = command
        if arguments:
            cmd += ' ' + " ".join([u(x) for x in arguments])
        for key, value in parameters.items():
            if value is True:
                cmd += ' ' + key
            elif key[-1] != '=':
                cmd += ' ' + key + argdelimiter + str(value)
            else:
                cmd += ' ' + key + str(value)
        if printcommand:
            print("STARTING COMMAND: " + cmd, file=stderr)

        self.begintime = datetime.datetime.now()
        # NOTE(review): the command runs through the shell and nothing is
        # quoted here; callers must not pass untrusted values.
        self.process = subprocess.Popen(cmd, shell=True, cwd=cwd or None, stdout=stdout, stderr=stderr)
        return self.process
Beispiel #10
0
    def process(self, sourcewords, debug=False):
        """Process a list of words, passing it to the server and realigning the output with the original words.

        Args:
            sourcewords: a list/tuple of words, or a single string that is
                split on spaces.
            debug: when true, traffic is echoed to stderr.

        Returns:
            A list with one entry per source word: the result tuple
            (word, lemma, pos, index, multiword) whose lower-cased word equals
            the lower-cased source word and lies closest in position, or
            (None, None, None, None, False) when no result matches.
        """
        if isinstance(sourcewords, (list, tuple)):
            sourcewords_s = " ".join(sourcewords)
        else:
            sourcewords_s = sourcewords
            sourcewords = sourcewords.split(' ')

        # The terminator must be bytes: bytes + str is a TypeError on Python 3.
        self.socket.sendall(sourcewords_s.encode(self.encoding) + b'\n\0')
        if debug:
            print("Sent:", sourcewords_s.encode(self.encoding), file=sys.stderr)

        results = []
        done = False
        while not done:
            data = b""
            while not data:
                buffer = self.socket.recv(self.BUFSIZE)
                if debug:
                    print("Buffer: [" + repr(buffer) + "]", file=sys.stderr)
                if not buffer:
                    # Connection closed: nothing more will arrive
                    # (indexing the empty buffer used to raise IndexError).
                    done = True
                    break
                # endswith() compares bytes with bytes; buffer[-1] is an int
                # on Python 3 and never equalled '\0'.
                if buffer.endswith(b'\0'):
                    data += buffer[:-1]
                    done = True
                    break
                data += buffer

            data = u(data, self.encoding)
            if debug:
                print("Received:", data, file=sys.stderr)

            for i, line in enumerate(data.strip(' \t\0\r\n').split('\n')):
                if not line.strip():
                    done = True
                    break
                cols = line.split(" ")
                subwords = cols[0].lower().split("_")
                if len(cols) > 2:  # this seems a bit odd?
                    for word in subwords:  # split multiword expressions
                        # word, lemma, pos, index, multiword?
                        results.append((word, cols[1], cols[2], i, len(subwords) > 1))

        sourcewords = [w.lower() for w in sourcewords]

        alignment = []
        for i, sourceword in enumerate(sourcewords):
            found = False
            best = 0
            distance = 999999
            for j, result in enumerate(results):
                if sourceword == result[0] and abs(i - j) < distance:
                    found = True
                    best = j
                    distance = abs(i - j)

            if found:
                alignment.append(results[best])
            else:
                alignment.append((None, None, None, None, False))  # no alignment found
        return alignment
Beispiel #11
0
 def output(self,delimiter = '\t', addnormalised=False):
     """Generate one text line per entry of the frequency list.

     Each line is the entry, the delimiter and the count; with
     addnormalised a further delimiter and count/self.total follow.
     Tuple and list entries are rendered as their u()-converted items
     joined by spaces.
     """
     for entry, count in self:
         if isinstance(entry, (tuple, list)):
             label = " ".join((u(x) for x in entry))
         elif isstring(entry):
             label = entry
         else:
             label = str(entry)

         text = label + delimiter + str(count)
         if addnormalised:
             text = text + delimiter + str(count/self.total)
         yield text
Beispiel #12
0
    def __iter__(self):
        """Iterate over the alignment file, one sentence pair per record.

        The file is rewound and then read as consecutive three-line records:
        a header line starting with ``#``, a target line and a source line.

        Yields:
            GizaSentenceAlignment(sourceline, targetline, sentenceindex), with
            sentenceindex counting records from 1.

        Raises:
            Exception: if a record does not start with a ``#`` header line or
                if the file ends in the middle of a record.
        """
        self.f.seek(0)
        sentenceindex = 0

        # Read with a sentinel rather than letting StopIteration escape:
        # inside a generator that surfaces as RuntimeError (PEP 479), which
        # made an empty file blow up instead of simply yielding nothing.
        header = next(self.f, None)
        while header is not None:
            sentenceindex += 1
            line = u(header)
            if not line.startswith('#'):
                raise Exception("Error parsing GIZA++ Alignment at sentence " + str(sentenceindex) + ", expected new fragment, found: " + repr(line))

            targetline = next(self.f, None)
            sourceline = next(self.f, None)
            if targetline is None or sourceline is None:
                raise Exception("Error parsing GIZA++ Alignment at sentence " + str(sentenceindex) + ", unexpected end of file")

            yield GizaSentenceAlignment(u(sourceline), u(targetline), sentenceindex)

            header = next(self.f, None)
Beispiel #13
0
 def output(self, delimiter='\t', addnormalised=False):
     """Generate one text line per entry of the frequency list.

     Each line is the entry, the delimiter and the count; with
     addnormalised a further delimiter and count / self.total follow.
     Tuple and list entries are rendered as their u()-converted items
     joined by spaces.
     """
     for entry, count in self:
         if isinstance(entry, (tuple, list)):
             label = " ".join((u(x) for x in entry))
         elif isstring(entry):
             label = entry
         else:
             label = str(entry)

         text = label + delimiter + str(count)
         if addnormalised:
             text = text + delimiter + str(count / self.total)
         yield text
Beispiel #14
0
    def __iter__(self):  # by Sander Canisius
        """Yield (src, trg, alignment) for every record in self.stream.

        A record is three lines: a header starting with ``#``, a line whose
        whitespace-separated tokens become ``src``, and a line whose tokens
        are handed to parseAlignment(), which supplies (targetWord,
        positions) pairs. ``alignment`` has one slot per src token holding
        the index of the target word that lists that (1-based) position, or
        None. When self.encoding is set, the words are passed through
        u(w, self.encoding).

        Raises:
            AssertionError: if a header line does not start with ``#`` or a
                source position is claimed by two target words.
        """
        line = self.stream.readline()
        while line:
            assert line.startswith("#"), "expected header line starting with '#'"
            src = self.stream.readline().split()
            trg = []
            # List repetition instead of a comprehension over xrange(), which
            # does not exist on Python 3.
            alignment = [None] * len(src)

            for i, (targetWord, positions) in enumerate(parseAlignment(self.stream.readline().split())):

                trg.append(targetWord)

                for pos in positions:
                    assert alignment[pos - 1] is None, "source position aligned more than once"
                    alignment[pos - 1] = i

            if self.encoding:
                yield [u(w, self.encoding) for w in src], [u(w, self.encoding) for w in trg], alignment
            else:
                yield src, trg, alignment

            line = self.stream.readline()
Beispiel #15
0
    def __iter__(self): #by Sander Canisius
        """Yield (src, trg, alignment) for every record in self.stream.

        A record is three lines: a header starting with ``#``, a line whose
        whitespace-separated tokens become ``src``, and a line whose tokens
        are handed to parseAlignment(), which supplies (targetWord,
        positions) pairs. ``alignment`` has one slot per src token holding
        the index of the target word that lists that (1-based) position, or
        None. When self.encoding is set, the words are passed through
        u(w, self.encoding).

        Raises:
            AssertionError: if a header line does not start with ``#`` or a
                source position is claimed by two target words.
        """
        line = self.stream.readline()
        while line:
            assert line.startswith("#"), "expected header line starting with '#'"
            src = self.stream.readline().split()
            trg = []
            # List repetition instead of a comprehension over xrange(), which
            # does not exist on Python 3.
            alignment = [None] * len(src)

            for i, (targetWord, positions) in enumerate(parseAlignment(self.stream.readline().split())):

                trg.append(targetWord)

                for pos in positions:
                    assert alignment[pos - 1] is None, "source position aligned more than once"
                    alignment[pos - 1] = i

            if self.encoding:
                yield [ u(w,self.encoding) for w in src ], [ u(w,self.encoding) for w in trg ], alignment
            else:
                yield src, trg, alignment

            line = self.stream.readline()
Beispiel #16
0
 def flush(sentences):
     """Run the external tagger over *sentences* and write the result to f_out.

     The sentences are joined with newlines and piped to self.tagger in a
     single run. Every non-empty output line is read as tab-separated
     word, pos, lemma. With oneperline each token is written on its own
     line as word<TAB>lemma<TAB>pos; otherwise tokens are written
     space-separated as word|lemma|pos, with any ``|`` inside a value
     replaced by ``_``. Output is flushed to f_out whenever a tag starts
     with ``$`` and once more at the end; in oneperline mode an empty line
     follows each flush. Does nothing for an empty batch.
     """
     if not sentences:
         return

     print("Processing " + str(len(sentences)) + " lines",file=stderr)

     # One tagger run for the whole batch. This used to sit inside a loop
     # over the sentences, so the full batch was tagged and written once
     # per sentence.
     p = subprocess.Popen([self.tagger], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     (results, _err) = p.communicate("\n".join(sentences).encode('utf-8'))

     out = ""
     # communicate() returns bytes; decode before splitting on a str.
     for line in u(results).split('\n'):
         line = line.strip()
         if not line:
             continue
         fields = line.split('\t')
         word = fields[0]
         pos = fields[1]
         lemma = fields[2]
         if oneperline:
             if out: out += "\n"
             out += word + "\t" + lemma + "\t" + pos
         else:
             if out: out += " "
             word = word.replace('|','_')
             lemma = lemma.replace('|','_')
             pos = pos.replace('|','_')
             # Only the inline format uses the pipe-separated triple; it
             # was previously appended in oneperline mode as well.
             out += word + "|" + lemma + "|" + pos
         if pos.startswith('$'):
             f_out.write(u(out) + "\n")
             if oneperline: f_out.write("\n")
             out = ""

     if out:
         f_out.write(u(out) + "\n")
         if oneperline: f_out.write("\n")
Beispiel #17
0
    def __contains__(self, phrase):
        """Ask the phrase-table server whether it knows *phrase*.

        Returns:
            False if any response line is ``NOTFOUND``, True otherwise. On a
            True result the decoded response and the phrase are stored in
            self.lastresponse and self.lastquery.

        Raises:
            IOError: if the connection closes before a full response arrived.
        """
        self.socket.sendall(phrase.encode('utf-8') + b"\r\n")

        data = b""
        # Test the terminator as bytes: indexing bytes yields an int on
        # Python 3, so comparing data[-1] with '\n' could never match and the
        # loop never ended.
        while not data.endswith(b"\n"):
            chunk = self.socket.recv(self.BUFSIZE)
            if not chunk:
                raise IOError("Connection closed by phrase table server")
            data += chunk

        data = u(data)

        for line in data.split('\n'):
            if line.strip('\r\n') == "NOTFOUND":
                return False

        self.lastresponse = data
        self.lastquery = phrase

        return True
Beispiel #18
0
    def __contains__(self, phrase):
        """Ask the phrase-table server whether it knows *phrase*.

        Returns:
            False if any response line is ``NOTFOUND``, True otherwise. On a
            True result the decoded response and the phrase are stored in
            self.lastresponse and self.lastquery.

        Raises:
            IOError: if the connection closes before a full response arrived.
        """
        self.socket.sendall(phrase.encode('utf-8') + b"\r\n")

        data = b""
        # Test the terminator as bytes: indexing bytes yields an int on
        # Python 3, so comparing data[-1] with '\n' could never match and the
        # loop never ended.
        while not data.endswith(b"\n"):
            chunk = self.socket.recv(self.BUFSIZE)
            if not chunk:
                raise IOError("Connection closed by phrase table server")
            data += chunk

        data = u(data)

        for line in data.split('\n'):
            if line.strip('\r\n') == "NOTFOUND":
                return False

        self.lastresponse = data
        self.lastquery = phrase

        return True
    def process(self, sourcewords, debug=False):
        """Process a list of words, passing it to the server and realigning the output with the original words.

        Args:
            sourcewords: a list/tuple of words, or a single string that is
                split on spaces.
            debug: when true, traffic is echoed to stderr.

        Returns:
            A list with one entry per source word: the result tuple
            (word, lemma, pos, index, multiword) whose lower-cased word equals
            the lower-cased source word and lies closest in position, or
            (None, None, None, None, False) when no result matches.
        """
        if isinstance(sourcewords, (list, tuple)):
            sourcewords_s = " ".join(sourcewords)
        else:
            sourcewords_s = sourcewords
            sourcewords = sourcewords.split(' ')

        # The terminator must be bytes: bytes + str is a TypeError on Python 3.
        self.socket.sendall(sourcewords_s.encode(self.encoding) + b'\n\0')
        if debug:
            print("Sent:",
                  sourcewords_s.encode(self.encoding),
                  file=sys.stderr)

        results = []
        done = False
        while not done:
            data = b""
            while not data:
                buffer = self.socket.recv(self.BUFSIZE)
                if debug:
                    print("Buffer: [" + repr(buffer) + "]", file=sys.stderr)
                if not buffer:
                    # Connection closed: nothing more will arrive
                    # (indexing the empty buffer used to raise IndexError).
                    done = True
                    break
                # endswith() compares bytes with bytes; buffer[-1] is an int
                # on Python 3 and never equalled '\0'.
                if buffer.endswith(b'\0'):
                    data += buffer[:-1]
                    done = True
                    break
                data += buffer

            data = u(data, self.encoding)
            if debug:
                print("Received:", data, file=sys.stderr)

            for i, line in enumerate(data.strip(' \t\0\r\n').split('\n')):
                if not line.strip():
                    done = True
                    break
                cols = line.split(" ")
                subwords = cols[0].lower().split("_")
                if len(cols) > 2:  #this seems a bit odd?
                    for word in subwords:  #split multiword expressions
                        #word, lemma, pos, index, multiword?
                        results.append(
                            (word, cols[1], cols[2], i, len(subwords) > 1))

        sourcewords = [w.lower() for w in sourcewords]

        alignment = []
        for i, sourceword in enumerate(sourcewords):
            found = False
            best = 0
            distance = 999999
            for j, result in enumerate(results):
                if sourceword == result[0] and abs(i - j) < distance:
                    found = True
                    best = j
                    distance = abs(i - j)

            if found:
                alignment.append(results[best])
            else:
                alignment.append(
                    (None, None, None, None, False))  #no alignment found
        return alignment
Beispiel #20
0
 def _sanitize(self, word_id):
     """Return word_id after conversion with u()."""
     sanitized = u(word_id)
     return sanitized
Beispiel #21
0
    def __init__(self,filename, quiet=False, reverse=False, delimiter="|||", score_column = 3, max_sourcen = 0,sourceencoder=None, targetencoder=None, scorefilter=None):
        """Load a phrase table from file into memory (memory intensive!)

        Args:
            filename: path of the phrase table; a ``bz2`` or ``gz`` extension
                selects the matching decompressor, anything else is opened
                as UTF-8 text.
            quiet: when true, no progress lines are printed to stderr.
            reverse: when true, the second column is the source and the first
                the target, instead of the other way around.
            delimiter: column separator within a line.
            score_column: 1-based index of the column holding
                whitespace-separated float scores; 0 disables score parsing.
            max_sourcen: when positive, skip entries whose source phrase has
                more space-separated words than this.
            sourceencoder: optional callable applied to each source phrase.
            targetencoder: optional callable applied to each target phrase.
            scorefilter: optional callable taking the scores tuple; entries
                for which it returns a false value are skipped.

        The result is self.phrasetable, mapping each source to a tuple of
        (target, scores) pairs.
        """
        self.phrasetable = {}
        self.sourceencoder = sourceencoder
        self.targetencoder = targetencoder

        extension = filename.split(".")[-1]
        if extension == "bz2":
            f = bz2.BZ2File(filename,'r')
        elif extension == "gz":
            f = gzip.GzipFile(filename,'r')
        else:
            f = io.open(filename,'r',encoding='utf-8')

        # With reverse the two phrase columns simply swap roles.
        if reverse:
            sourcecolumn, targetcolumn = 1, 0
        else:
            sourcecolumn, targetcolumn = 0, 1

        linenum = 0
        prevsource = None
        targets = []

        # try/finally so the file is closed even when a line fails to parse
        # (e.g. float() raising on a malformed score).
        try:
            while True:
                if not quiet:
                    linenum += 1
                    if (linenum % 100000) == 0:
                        print("Loading phrase-table: @%d" % linenum, "\t(" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ")",file=sys.stderr)
                line = u(f.readline())
                if not line:
                    break

                #split into (trimmed) segments
                segments = [ segment.strip() for segment in line.split(delimiter) ]

                if len(segments) < 3:
                    print("Invalid line: ", line, file=sys.stderr)
                    continue

                #Do we have a score associated?
                if score_column > 0 and len(segments) >= score_column:
                    scores = tuple( float(x) for x in segments[score_column-1].split() )
                else:
                    scores = tuple()

                if scorefilter and not scorefilter(scores):
                    continue

                if max_sourcen > 0 and segments[sourcecolumn].count(' ') + 1 > max_sourcen:
                    continue

                source = segments[sourcecolumn]
                if self.sourceencoder:
                    source = self.sourceencoder(source)
                target = segments[targetcolumn]
                if self.targetencoder:
                    target = self.targetencoder(target)

                # Entries are grouped per run of identical consecutive
                # sources; a source reappearing later replaces its earlier
                # entry.
                if prevsource and source != prevsource and targets:
                    self.phrasetable[prevsource] = tuple(targets)
                    targets = []

                targets.append( (target,scores) )
                prevsource = source
        finally:
            f.close()

        #don't forget last one:
        if prevsource and targets:
            self.phrasetable[prevsource] = tuple(targets)
Beispiel #22
0
    def __init__(self,
                 filename,
                 quiet=False,
                 reverse=False,
                 delimiter="|||",
                 score_column=3,
                 max_sourcen=0,
                 sourceencoder=None,
                 targetencoder=None,
                 scorefilter=None):
        """Load a phrase table from file into memory (memory intensive!)

        Args:
            filename: path of the phrase table; a ``bz2`` or ``gz`` extension
                selects the matching decompressor, anything else is opened
                as UTF-8 text.
            quiet: when true, no progress lines are printed to stderr.
            reverse: when true, the second column is the source and the first
                the target, instead of the other way around.
            delimiter: column separator within a line.
            score_column: 1-based index of the column holding
                whitespace-separated float scores; 0 disables score parsing.
            max_sourcen: when positive, skip entries whose source phrase has
                more space-separated words than this.
            sourceencoder: optional callable applied to each source phrase.
            targetencoder: optional callable applied to each target phrase.
            scorefilter: optional callable taking the scores tuple; entries
                for which it returns a false value are skipped.

        The result is self.phrasetable, mapping each source to a tuple of
        (target, scores) pairs.
        """
        self.phrasetable = {}
        self.sourceencoder = sourceencoder
        self.targetencoder = targetencoder

        extension = filename.split(".")[-1]
        if extension == "bz2":
            f = bz2.BZ2File(filename, 'r')
        elif extension == "gz":
            f = gzip.GzipFile(filename, 'r')
        else:
            f = io.open(filename, 'r', encoding='utf-8')

        # With reverse the two phrase columns simply swap roles.
        if reverse:
            sourcecolumn, targetcolumn = 1, 0
        else:
            sourcecolumn, targetcolumn = 0, 1

        linenum = 0
        prevsource = None
        targets = []

        # try/finally so the file is closed even when a line fails to parse
        # (e.g. float() raising on a malformed score).
        try:
            while True:
                if not quiet:
                    linenum += 1
                    if (linenum % 100000) == 0:
                        print(
                            "Loading phrase-table: @%d" % linenum,
                            "\t(" +
                            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                            ")",
                            file=sys.stderr)
                line = u(f.readline())
                if not line:
                    break

                #split into (trimmed) segments
                segments = [segment.strip() for segment in line.split(delimiter)]

                if len(segments) < 3:
                    print("Invalid line: ", line, file=sys.stderr)
                    continue

                #Do we have a score associated?
                if score_column > 0 and len(segments) >= score_column:
                    scores = tuple(
                        float(x) for x in segments[score_column - 1].split())
                else:
                    scores = tuple()

                if scorefilter and not scorefilter(scores):
                    continue

                if max_sourcen > 0 and segments[sourcecolumn].count(' ') + 1 > max_sourcen:
                    continue

                source = segments[sourcecolumn]
                if self.sourceencoder:
                    source = self.sourceencoder(source)
                target = segments[targetcolumn]
                if self.targetencoder:
                    target = self.targetencoder(target)

                # Entries are grouped per run of identical consecutive
                # sources; a source reappearing later replaces its earlier
                # entry.
                if prevsource and source != prevsource and targets:
                    self.phrasetable[prevsource] = tuple(targets)
                    targets = []

                targets.append((target, scores))
                prevsource = source
        finally:
            f.close()

        #don't forget last one:
        if prevsource and targets:
            self.phrasetable[prevsource] = tuple(targets)
Beispiel #23
0
 def __unicode__(self):  #Python 2.x
     """Return self.value converted with u()."""
     text = u(self.value)
     return text
Beispiel #24
0
 def __unicode__(self): #Python 2.x
     """Return self.value converted with u()."""
     text = u(self.value)
     return text
Beispiel #25
0
     def process(self, words, debug=False):
         """Tag and lemmatise *words* with the backend selected by self.mode.

         Modes:
             file: reads the next line from self.tagger and parses its
                 space-separated ``word|lemma|pos`` items; *words* is ignored.
             frog: joins *words* with spaces and calls self.tagger.process().
             freeling: calls self.tagger.process(words, debug).
             corenlp: parses the JSON returned by self.tagger.parse().
             lookup: looks each lower-cased word up in self.tagger; unknown
                 words get themselves as lemma and ``?`` as tag.
             treetagger: pipes the words to the self.tagger executable and
                 parses its tab-separated word, pos, lemma output.

         Returns:
             A tuple (words, postags, lemmas) of three lists.

         Raises:
             Exception: for an unparsable item in file mode or an unknown mode.
             OSError: if the TreeTagger process exits with a non-zero status.
         """
         if self.mode == 'file':
             # next() works for Python 2 and 3 iterators alike; the .next()
             # method does not exist on Python 3 iterators.
             line = next(self.tagger)
             newwords = []
             postags = []
             lemmas = []
             for item in line.split(' '):
                 if item.strip():
                     try:
                         word, lemma, pos = item.split('|')
                     except ValueError:
                         raise Exception("Unable to parse word|lemma|pos in " + item)
                     newwords.append(word)
                     postags.append(pos)
                     lemmas.append(lemma)
             return newwords, postags, lemmas
         elif self.mode == "frog":
             newwords = []
             postags = []
             lemmas = []
             for fields in self.tagger.process(' '.join(words)):
                 word, lemma, morph, pos = fields[:4]
                 newwords.append(word)
                 postags.append(pos)
                 lemmas.append(lemma)
             return newwords, postags, lemmas
         elif self.mode == "freeling":
             postags = []
             lemmas = []
             for fields in self.tagger.process(words, debug):
                 word, lemma, pos = fields[:3]
                 postags.append(pos)
                 lemmas.append(lemma)
             return words, postags, lemmas
         elif self.mode == "corenlp":
             data = json.loads(self.tagger.parse(" ".join(words)))
             words = []
             postags = []
             lemmas = []
             for sentence in data['sentences']:
                 for word, worddata in sentence['words']:
                     words.append(word)
                     lemmas.append(worddata['Lemma'])
                     postags.append(worddata['PartOfSpeech'])
             return words, postags, lemmas
         elif self.mode == 'lookup':
             postags = []
             lemmas = []
             for word in words:
                 try:
                     lemma, pos = self.tagger[word.lower()]
                 except KeyError:
                     lemma, pos = word, '?'
                 lemmas.append(lemma)
                 postags.append(pos)
             return words, postags, lemmas
         elif self.mode == 'treetagger':
             s = u(" ".join(words))

             p = subprocess.Popen([self.tagger], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             (out, err) = p.communicate(s.encode('utf-8'))

             # Check for failure before parsing, so a failed run is reported
             # as such instead of as an IndexError on partial output.
             if p.returncode != 0:
                 print(err, file=stderr)
                 raise OSError('TreeTagger failed')

             newwords = []
             postags = []
             lemmas = []
             # Decode once up front: the output is bytes, and the unicode()
             # builtin used per field before does not exist on Python 3.
             for line in u(out, 'utf-8').split('\n'):
                 line = line.strip()
                 if line:
                     fields = line.split('\t')
                     newwords.append(fields[0])
                     postags.append(fields[1])
                     lemmas.append(fields[2])

             return newwords, postags, lemmas
         else:
             raise Exception("Unknown mode")