def __init__(self):
    """Initialise the placeholder counter and the Alignments helper.

    NOTE(review): this definition sits outside any visible class header and
    mirrors the constructor of ``Utils`` -- confirm whether it is a stray
    duplicate.
    """
    # Placeholder value; nothing visible here reads it back.
    self.__something = 0
    # Helper object created from the project's Alignments type.
    self.__Alignments = Alignments()
class Utils:
    """File-reading and sequence helpers for sentence / alignment data.

    The readers understand two layouts: blocks delimited by lines starting
    with ``<Sentence`` and ``</Sentence``, and plain blocks of tab-separated
    lines separated by blank lines. The remaining methods enumerate index
    spans and splice tokens of a second sentence into a first one
    (presumably to build code-switched "CS" sentences -- confirm with the
    callers).
    """

    def __init__(self):
        # Placeholder value; no method of this class reads it.
        self.__something = 0
        # Project helper used by readAligns to build alignment dictionaries.
        self.__Alignments = Alignments()

    def _readAll(self, fileName, nextSentence):
        """Collect blocks from fileName until nextSentence(reader) is empty."""
        sents = []
        # The context manager guarantees the handle is closed, even on error.
        with open(fileName) as reader:
            while True:
                sent = nextSentence(reader)
                if not sent:
                    break
                sents.append(sent)
        return sents

    def readSentences(self, fileName):
        """Read every ``<Sentence ...>`` block of fileName.

        Returns a list with one entry per block; each entry is the list of
        raw lines produced by getNextSentence.
        """
        return self._readAll(fileName, self.getNextSentence)

    def readSentencesPlain(self, fileName):
        """Read every blank-line separated block of fileName.

        Returns a list with one entry per block; each entry is the list of
        tab-split rows produced by getNextSentencePlain. Reading stops at the
        first empty block, so two consecutive blank lines end the read.
        """
        return self._readAll(fileName, self.getNextSentencePlain)

    def readAligns(self, l1Aligns, l2Aligns):
        """Read two alignment files in lockstep and build alignment objects.

        Each pair of ``<Sentence`` blocks is handed to
        ``Alignments.createAlignDict``; the results are returned as a list.
        Reading stops as soon as the first file yields no further block.
        """
        aligns = []
        with open(l1Aligns) as l1Reader, open(l2Aligns) as l2Reader:
            while True:
                l1Align = self.getNextSentence(l1Reader)
                l2Align = self.getNextSentence(l2Reader)
                if not l1Align:
                    break
                aligns.append(
                    self.__Alignments.createAlignDict(l1Align, l2Align))
        return aligns

    def getNextSentence(self, filePtr):
        """Return the lines of the next ``<Sentence`` ... ``</Sentence`` block.

        Lines before the opening marker are skipped. Both marker lines are
        part of the returned list, and lines keep their trailing newline.
        An empty list is returned at end of file, or when a closing marker is
        met before any opening marker.
        """
        sent = []
        inside = False
        while True:
            line = filePtr.readline()
            if not line:
                break
            if line.startswith("<Sentence"):
                inside = True
            if inside:
                sent.append(line)
            if line.startswith("</Sentence"):
                # The closing marker ends the block, whether or not one was open.
                break
        return sent

    def getNextSentencePlain(self, filePtr):
        """Return the next block of tab-separated rows.

        Each non-blank line is stripped and split on tabs. The block ends at
        the first blank line or at end of file, so a leading blank line
        yields an empty list.
        """
        sent = []
        while True:
            line = filePtr.readline()
            if not line:
                break
            stripped = line.strip()
            if stripped == "":
                return sent
            sent.append(stripped.split('\t'))
        return sent

    def validSequences(self, length):
        """Enumerate every contiguous index span of range(length).

        The span covering all ``length`` indices is excluded. Spans are
        ordered by start index, then by end index; for ``length == 3`` the
        result is ``[[0], [0, 1], [1], [1, 2], [2]]``.
        """
        sequences = []
        for start in range(length):
            for end in range(start, length):
                # Skip the one span that would cover the whole range.
                if end - start + 1 != length:
                    sequences.append(list(range(start, end + 1)))
        return sequences

    def makeCSSentence(self, l1Sent, l1Sequence, l2Sent, align, L1, L2):
        """Replace a span of l1Sent with the aligned entries of l2Sent.

        Entries of l1Sent whose index lies outside
        ``l1Sequence[0] .. l1Sequence[-1]`` are copied and tagged with L1.
        At the first index inside that span, the l2Sent entries selected by
        ``l2Sequence(l1Sequence, align)`` are copied and tagged with L2; the
        remaining indices of the span contribute nothing.

        Returns a new list of lists; the input entries are not modified.
        Raises IndexError if l1Sequence is empty while l1Sent is not.
        """
        csSent = []
        spanInserted = False
        l2Indices = self.l2Sequence(l1Sequence, align)
        for i, wordTag in enumerate(l1Sent):
            if i < l1Sequence[0] or i > l1Sequence[-1]:
                tagged = list(wordTag)
                tagged.append(L1)
                csSent.append(tagged)
            elif not spanInserted:
                for j in l2Indices:
                    try:
                        tagged = list(l2Sent[j])
                    except (IndexError, KeyError, TypeError):
                        # Best effort: skip alignment targets that do not
                        # resolve to an entry of l2Sent.
                        continue
                    tagged.append(L2)
                    csSent.append(tagged)
                spanInserted = True
        return csSent

    def wordTags(self, sentence):
        """Extract ``[word, tag]`` pairs from a bracketed string.

        The sentence is split on whitespace. Every token ending in ``)`` is
        taken as a word and the token before it as its tag; surrounding
        ``)`` and ``(`` characters are stripped from word and tag
        respectively. ``"(DT the) (NN dog)"`` gives
        ``[["the", "DT"], ["dog", "NN"]]``.

        NOTE(review): an earlier comment here mentioned a ``_#`` delimiter,
        but this implementation relies only on whitespace and parentheses.
        """
        tokens = sentence.strip().split()
        pairs = []
        # Start at 1: the first token has no predecessor to act as its tag
        # (index -1 would wrap around to the last token).
        for i in range(1, len(tokens)):
            if tokens[i][-1] == ")":
                pairs.append([tokens[i].strip(")"), tokens[i - 1].strip("(")])
        return pairs

    def l2Sequence(self, l1Sequence, align):
        """Map l1 indices to the sorted, de-duplicated aligned l2 indices.

        Indices of l1Sequence that are not keys of ``align`` are ignored.
        """
        targets = []
        for i in l1Sequence:
            # Unaligned indices are skipped; the original author noted that
            # the sequence in use is already checked for contiguity on the
            # L2 side.
            if i in align:
                for j in align[i]:
                    if j not in targets:
                        targets.append(j)
        targets.sort()
        return targets