def __iter__(self):
    self.f.seek(0)
    nextlinebuffer = u(next(self.f))
    sentenceindex = 0
    done = False
    while not done:
        sentenceindex += 1
        line = nextlinebuffer
        #each GIZA++ fragment starts with a '#' header line
        if line[0] != '#':
            raise Exception("Error parsing GIZA++ Alignment at sentence " + str(sentenceindex) + ", expected new fragment, found: " + repr(line))
        targetline = u(next(self.f))
        sourceline = u(next(self.f))
        yield GizaSentenceAlignment(sourceline, targetline, sentenceindex)
        try:
            nextlinebuffer = u(next(self.f))
        except StopIteration:
            done = True
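# Usage sketch (hedged; assumes this __iter__ belongs to a reader class, here
# called GizaModel, whose constructor opens the A3 alignment file as self.f --
# that constructor is not shown in this section):
#
#   model = GizaModel("corpus.A3.final")
#   for sentalign in model:          #yields one GizaSentenceAlignment per sentence pair
#       print(sentalign)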
def process(self, input_data, source_encoding="utf-8", return_unicode=True, oldfrog=False):
    """Receives input_data as a str or unicode object, passes this to the
    server with proper consideration for the encodings, and returns the Frog
    output as a list of tuples: (word, lemma, morphology, pos). Each element
    is a proper unicode object. The return_unicode parameter is no longer
    optional (it is fixed to True) and remains only for backward compatibility."""
    if isinstance(input_data, (list, tuple)):
        input_data = " ".join(input_data)
    input_data = u(input_data, source_encoding)  #decode (or preferably do this in an earlier stage)
    input_data = input_data.strip(' \t\n')
    s = input_data.encode(self.server_encoding) + b'\r\n'
    if not oldfrog:
        s += b'EOT\r\n'
    self.socket.sendall(s)  #send to socket in desired encoding
    output = []
    done = False
    while not done:
        data = b""
        while not data.endswith(b'\n'):
            moredata = self.socket.recv(self.BUFSIZE)
            if not moredata:
                break
            data += moredata
        data = u(data, self.server_encoding)
        for line in data.strip(' \t\r\n').split('\n'):
            if line == "READY":
                done = True
                break
            elif line:
                line = line.split('\t')  #split on tab
                if len(line) > 4 and line[0].isdigit():  #first column is token number
                    if line[0] == '1' and output:
                        #start of a new sentence: emit a None-tuple as sentence separator
                        if self.returnall:
                            output.append((None, None, None, None, None, None, None, None))
                        else:
                            output.append((None, None, None, None))
                    fields = line[1:]
                    parse1 = parse2 = ner = chunk = ""
                    word, lemma, morph, pos = fields[0:4]
                    if len(fields) > 5:
                        ner = fields[5]
                    if len(fields) > 6:
                        chunk = fields[6]
                    if len(fields) > 7:
                        parse1 = fields[7]
                    if len(fields) > 8:  #guard separately so a line with only parse1 does not raise an IndexError
                        parse2 = fields[8]
                    if len(fields) < 5:
                        raise Exception("Can't process response line from Frog: ", repr(line), " got unexpected number of fields ", str(len(fields) + 1))
                    if self.returnall:
                        output.append((word, lemma, morph, pos, ner, chunk, parse1, parse2))
                    else:
                        output.append((word, lemma, morph, pos))
    return output
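# Usage sketch (hedged; assumes this method lives on a Frog client class
# constructed with a host and port, as in pynlpl's FrogClient -- the exact
# constructor is not shown in this section):
#
#   frogclient = FrogClient('localhost', 12345)
#   for word, lemma, morph, pos in frogclient.process("Dit is een test ."):
#       if word is None:
#           continue  #None-tuples mark sentence boundaries
#       print(word, lemma, pos)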
def __str__(self):
    if not self.computed:
        self.compute()
    o = "%-15s TP\tFP\tTN\tFN\tAccuracy\tPrecision\tRecall(TPR)\tSpecificity(TNR)\tF-score\n" % ("")
    for cls in sorted(set(self.classes)):
        cls = u(cls)
        #%.4f prints four decimal places per metric
        o += "%-15s %d\t%d\t%d\t%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n" % (cls, self.tp[cls], self.fp[cls], self.tn[cls], self.fn[cls], self.accuracy(cls), self.precision(cls), self.recall(cls), self.specificity(cls), self.fscore(cls))
    return o + "\n" + self.outputmetrics()
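# Usage sketch (hedged; assumes this __str__ belongs to a per-class evaluation
# object such as pynlpl's ClassEvaluation -- the constructor taking gold and
# predicted label sequences is an assumption, not shown in this section):
#
#   evaluation = ClassEvaluation(goldlabels, predictedlabels)
#   print(evaluation)   #per-class TP/FP/TN/FN table followed by overall metrics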
def startcommand(self, command, cwd, stdout, stderr, *arguments, **parameters):
    argdelimiter = ' '
    printcommand = True
    cmd = command
    if arguments:
        cmd += ' ' + " ".join([u(x) for x in arguments])
    if parameters:
        for key, value in parameters.items():
            if key == 'argdelimiter':
                argdelimiter = value
            elif key == 'printcommand':
                printcommand = value
            elif isinstance(value, bool) and value == True:
                cmd += ' ' + key  #boolean flag: emit the key only
            elif key[-1] != '=':
                cmd += ' ' + key + argdelimiter + str(value)
            else:
                cmd += ' ' + key + str(value)  #key ends in '=': no delimiter
    if printcommand:
        print("STARTING COMMAND: " + cmd, file=stderr)
    self.begintime = datetime.datetime.now()
    if not cwd:
        self.process = subprocess.Popen(cmd, shell=True, stdout=stdout, stderr=stderr)
    else:
        self.process = subprocess.Popen(cmd, shell=True, cwd=cwd, stdout=stdout, stderr=stderr)
    #pid = process.pid
    #os.waitpid(pid, 0) #wait for process to finish
    return self.process
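# Usage sketch (hedged; the wrapper object and the tool name are assumptions --
# the method itself only needs a command string, a working directory or None,
# and stdout/stderr handles). Note that flag names must carry their own dashes:
#
#   import sys
#   runner.startcommand("mytool", None, sys.stdout, sys.stderr,
#                       "input.txt",                 #positional argument
#                       **{'-v': True, '-n': 10})    #rendered as ' -v' and ' -n 10'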
def __getitem__(self, phrase):
    solutions = []
    if phrase != self.lastquery:
        self.socket.send(phrase.encode('utf-8') + b"\r\n")  #encode before sending over the socket
        data = b""
        while not data.endswith(b'\n'):  #read until the response ends with a newline (bytes comparison)
            data += self.socket.recv(self.BUFSIZE)
    else:
        data = self.lastresponse
    data = u(data)
    for line in data.split('\n'):
        line = line.strip('\r\n')
        if line == "NOTFOUND":
            raise KeyError(phrase)
        elif line:
            fields = tuple(line.split("\t"))
            if len(fields) == 4:
                solutions.append(fields)
            else:
                print("PHRASETABLECLIENT WARNING: Unable to parse response line", file=sys.stderr)
    self.lastresponse = data
    self.lastquery = phrase
    return solutions
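# Usage sketch (hedged; assumes a phrase-table client class wrapping a server
# connection -- the host/port constructor is an assumption):
#
#   client = PhraseTableClient('localhost', 12346)
#   try:
#       for fields in client["de kat"]:   #each entry is a 4-field tuple from the server
#           print(fields)
#   except KeyError:
#       print("phrase not in table")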
def process(self, sourcewords, debug=False):
    """Process a list of words, passing it to the server and realigning the output with the original words"""
    if isinstance(sourcewords, (list, tuple)):
        sourcewords_s = " ".join(sourcewords)
    else:
        sourcewords_s = sourcewords
        sourcewords = sourcewords.split(' ')
    self.socket.sendall(sourcewords_s.encode(self.encoding) + b'\n\0')  #terminator must be bytes, not str
    if debug:
        print("Sent:", sourcewords_s.encode(self.encoding), file=sys.stderr)
    results = []
    done = False
    while not done:
        data = b""
        while not data:
            buffer = self.socket.recv(self.BUFSIZE)
            if debug:
                print("Buffer: [" + repr(buffer) + "]", file=sys.stderr)
            if buffer[-1:] == b'\0':  #compare bytes with bytes (a single index yields an int in Python 3)
                data += buffer[:-1]
                done = True
                break
            else:
                data += buffer
        data = u(data, self.encoding)
        if debug:
            print("Received:", data, file=sys.stderr)
        for i, line in enumerate(data.strip(' \t\0\r\n').split('\n')):
            if not line.strip():
                done = True
                break
            else:
                cols = line.split(" ")
                subwords = cols[0].lower().split("_")
                if len(cols) > 2:  #this seems a bit odd?
                    for word in subwords:  #split multiword expressions
                        results.append((word, cols[1], cols[2], i, len(subwords) > 1))  #word, lemma, pos, index, multiword?
    sourcewords = [w.lower() for w in sourcewords]
    alignment = []
    for i, sourceword in enumerate(sourcewords):
        #greedily pick the matching result closest in position to the source word
        found = False
        best = 0
        distance = 999999
        for j, (targetword, lemma, pos, index, multiword) in enumerate(results):
            if sourceword == targetword and abs(i - j) < distance:
                found = True
                best = j
                distance = abs(i - j)
        if found:
            alignment.append(results[best])
        else:
            alignment.append((None, None, None, None, False))  #no alignment found
    return alignment
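# Usage sketch (hedged; assumes this method belongs to a FreeLing-style tagger
# client wrapping a socket connection -- the class name and constructor are
# assumptions):
#
#   client = FreeLingClient('localhost', 12347)
#   for word, lemma, pos, index, multiword in client.process("el gato duerme".split()):
#       if word is not None:
#           print(word, lemma, pos)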
def output(self, delimiter='\t', addnormalised=False):
    """Yield a delimited textual representation of the frequency list, one line per type"""
    for type, count in self:
        if isinstance(type, (tuple, list)):
            if addnormalised:
                yield " ".join((u(x) for x in type)) + delimiter + str(count) + delimiter + str(count / self.total)
            else:
                yield " ".join((u(x) for x in type)) + delimiter + str(count)
        elif isstring(type):
            if addnormalised:
                yield type + delimiter + str(count) + delimiter + str(count / self.total)
            else:
                yield type + delimiter + str(count)
        else:
            if addnormalised:
                yield str(type) + delimiter + str(count) + delimiter + str(count / self.total)
            else:
                yield str(type) + delimiter + str(count)
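# Usage sketch (hedged; assumes a FrequencyList-style class that iterates over
# (type, count) pairs and tracks self.total -- the count() API used to fill it
# is an assumption):
#
#   freqlist = FrequencyList()
#   for word in "to be or not to be".split():
#       freqlist.count(word)
#   for line in freqlist.output(addnormalised=True):
#       print(line)   #e.g. "to<TAB>2<TAB>0.333..."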
def __iter__(self):  #by Sander Canisius
    line = self.stream.readline()
    while line:
        assert line.startswith("#")
        src = self.stream.readline().split()
        trg = []
        alignment = [None for i in range(len(src))]  #range (xrange was Python 2 only)
        for i, (targetWord, positions) in enumerate(parseAlignment(self.stream.readline().split())):
            trg.append(targetWord)
            for pos in positions:
                assert alignment[pos - 1] is None
                alignment[pos - 1] = i
        if self.encoding:
            yield [u(w, self.encoding) for w in src], [u(w, self.encoding) for w in trg], alignment
        else:
            yield src, trg, alignment
        line = self.stream.readline()
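# Record sketch: each record in a GIZA++ A3 alignment file spans three lines.
# The first is a '#' header, the second a plain token line (bound to src here),
# and the third holds tokens with "({ positions })" groups decoded by
# parseAlignment, e.g.:
#
#   # Sentence pair (1) source length 3 target length 3 alignment score : 2.7e-05
#   het huis staat
#   NULL ({ }) the ({ 1 }) house ({ 2 }) stands ({ 3 })
#
# The yielded alignment list maps each position in src (0-based) to the index
# of the aligned word in trg, or None when unaligned.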
def flush(sentences):
    if sentences:
        print("Processing " + str(len(sentences)) + " lines", file=stderr)
        out = ""
        #the tagger is invoked once for the whole batch, which is fed to it
        #newline-delimited on stdin
        p = subprocess.Popen([self.tagger], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (results, err) = p.communicate("\n".join(sentences).encode('utf-8'))
        results = u(results, 'utf-8')  #communicate() returns bytes
        for line in results.split('\n'):
            line = line.strip()
            if line:
                fields = line.split('\t')
                word = fields[0]
                pos = fields[1]
                lemma = fields[2]
                if oneperline:
                    if out:
                        out += "\n"
                    out += word + "\t" + lemma + "\t" + pos
                else:
                    if out:
                        out += " "
                    if '|' in word:
                        word = word.replace('|', '_')
                    if '|' in lemma:
                        lemma = lemma.replace('|', '_')
                    if '|' in pos:
                        pos = pos.replace('|', '_')
                    out += word + "|" + lemma + "|" + pos
                if pos[0] == '$':  #sentence-final punctuation tag: flush the current sentence
                    out = u(out)
                    f_out.write(out + "\n")
                    if oneperline:
                        f_out.write("\n")
                    out = ""
        if out:
            out = u(out)
            f_out.write(out + "\n")
            if oneperline:
                f_out.write("\n")
def __contains__(self, phrase):
    self.socket.send(phrase.encode('utf-8') + b"\r\n")
    data = b""
    while not data.endswith(b'\n'):  #read until the response ends with a newline (bytes comparison)
        data += self.socket.recv(self.BUFSIZE)
    data = u(data)
    for line in data.split('\n'):
        line = line.strip('\r\n')
        if line == "NOTFOUND":
            return False
    self.lastresponse = data
    self.lastquery = phrase
    return True
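# Usage sketch (hedged; companion to __getitem__ above, same assumed client):
#
#   if "de kat" in client:
#       translations = client["de kat"]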
def _sanitize(self, word_id):
    return u(word_id)
def __init__(self, filename, quiet=False, reverse=False, delimiter="|||", score_column=3, max_sourcen=0, sourceencoder=None, targetencoder=None, scorefilter=None):
    """Load a phrase table from file into memory (memory intensive!)"""
    self.phrasetable = {}
    self.sourceencoder = sourceencoder
    self.targetencoder = targetencoder

    if filename.split(".")[-1] == "bz2":
        f = bz2.BZ2File(filename, 'r')
    elif filename.split(".")[-1] == "gz":
        f = gzip.GzipFile(filename, 'r')
    else:
        f = io.open(filename, 'r', encoding='utf-8')
    linenum = 0
    prevsource = None
    targets = []

    while True:
        if not quiet:
            linenum += 1
            if (linenum % 100000) == 0:
                print("Loading phrase-table: @%d" % linenum, "\t(" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + ")", file=sys.stderr)
        line = u(f.readline())
        if not line:
            break

        #split into (trimmed) segments
        segments = [segment.strip() for segment in line.split(delimiter)]

        if len(segments) < 3:
            print("Invalid line: ", line, file=sys.stderr)
            continue

        #Do we have a score associated?
        if score_column > 0 and len(segments) >= score_column:
            scores = tuple((float(x) for x in segments[score_column - 1].strip().split()))
        else:
            scores = tuple()

        #if align2_column > 0:
        #    try:
        #        null_alignments = segments[align2_column].count("()")
        #    except:
        #        null_alignments = 0
        #else:
        #    null_alignments = 0

        if scorefilter:
            if not scorefilter(scores):
                continue

        if reverse:
            if max_sourcen > 0 and segments[1].count(' ') + 1 > max_sourcen:
                continue
            if self.sourceencoder:
                source = self.sourceencoder(segments[1])  #tuple(segments[1].split(" "))
            else:
                source = segments[1]
            if self.targetencoder:
                target = self.targetencoder(segments[0])  #tuple(segments[0].split(" "))
            else:
                target = segments[0]
        else:
            if max_sourcen > 0 and segments[0].count(' ') + 1 > max_sourcen:
                continue
            if self.sourceencoder:
                source = self.sourceencoder(segments[0])  #tuple(segments[0].split(" "))
            else:
                source = segments[0]
            if self.targetencoder:
                target = self.targetencoder(segments[1])  #tuple(segments[1].split(" "))
            else:
                target = segments[1]

        if prevsource and source != prevsource and targets:
            self.phrasetable[prevsource] = tuple(targets)
            targets = []

        targets.append((target, scores))
        prevsource = source

    #don't forget last one:
    if prevsource and targets:
        self.phrasetable[prevsource] = tuple(targets)
    f.close()
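# Usage sketch (hedged; assumes this __init__ belongs to an in-memory
# PhraseTable class -- the lookup API over self.phrasetable is not shown in
# this section). A Moses-style phrase-table line looks like:
#
#   de kat ||| the cat ||| 0.45 0.30 0.25 0.12
#
#   table = PhraseTable("phrase-table.gz",
#                       scorefilter=lambda scores: scores and scores[2] > 0.01)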
def __unicode__(self):  #Python 2.x
    return u(self.value)
def process(self, words, debug=False):
    if self.mode == 'file':
        line = next(self.tagger)  #read the next pre-tagged line
        newwords = []
        postags = []
        lemmas = []
        for item in line.split(' '):
            if item.strip():
                try:
                    word, lemma, pos = item.split('|')
                except ValueError:
                    raise Exception("Unable to parse word|lemma|pos in " + item)
                newwords.append(word)
                postags.append(pos)
                lemmas.append(lemma)
        return newwords, postags, lemmas
    elif self.mode == "frog":
        newwords = []
        postags = []
        lemmas = []
        for fields in self.tagger.process(' '.join(words)):
            word, lemma, morph, pos = fields[:4]
            newwords.append(word)
            postags.append(pos)
            lemmas.append(lemma)
        return newwords, postags, lemmas
    elif self.mode == "freeling":
        postags = []
        lemmas = []
        for fields in self.tagger.process(words, debug):
            word, lemma, pos = fields[:3]
            postags.append(pos)
            lemmas.append(lemma)
        return words, postags, lemmas
    elif self.mode == "corenlp":
        data = json.loads(self.tagger.parse(" ".join(words)))
        words = []
        postags = []
        lemmas = []
        for sentence in data['sentences']:
            for word, worddata in sentence['words']:
                words.append(word)
                lemmas.append(worddata['Lemma'])
                postags.append(worddata['PartOfSpeech'])
        return words, postags, lemmas
    elif self.mode == 'lookup':
        postags = []
        lemmas = []
        for word in words:
            try:
                lemma, pos = self.tagger[word.lower()]
                lemmas.append(lemma)
                postags.append(pos)
            except KeyError:
                lemmas.append(word)
                postags.append('?')
        return words, postags, lemmas
    elif self.mode == 'treetagger':
        s = " ".join(words)
        s = u(s)
        p = subprocess.Popen([self.tagger], shell=False, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (out, err) = p.communicate(s.encode('utf-8'))
        out = u(out, 'utf-8')  #communicate() returns bytes
        newwords = []
        postags = []
        lemmas = []
        for line in out.split('\n'):
            line = line.strip()
            if line:
                fields = line.split('\t')
                newwords.append(fields[0])
                postags.append(fields[1])
                lemmas.append(fields[2])
        if p.returncode != 0:
            print(err, file=stderr)
            raise OSError('TreeTagger failed')
        return newwords, postags, lemmas
    else:
        raise Exception("Unknown mode")
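# Usage sketch (hedged; assumes a tagger multiplexer class whose constructor
# selects one of the modes above and sets up self.tagger accordingly -- the
# constructor signature is an assumption):
#
#   tagger = Tagger('frog', 'localhost', 12345)
#   words, postags, lemmas = tagger.process("dit is een test".split())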