# NOTE(review): this module-level function repeats the body of
# CSHandler.__init__ and sits outside any class; it looks like a stray
# duplicate -- confirm whether anything uses it. Outside a class body the
# double-underscore attribute names below are not name-mangled.
def __init__(self):
    # Attach two Parsetree objects, an empty alignment dict, a Utils helper
    # and an empty phrase map to ``self``.
    self.__L1Tree = Parsetree()
    self.__L2Tree = Parsetree()
    self.__align = {}
    self.__utils = Utils()
    self.__phraseMap = {}
class CSHandler:
    """Choose a code-switch span for a sentence pair and build the result.

    The handler keeps one parse tree per language, the word alignment
    between them and a phrase-tag map.  ``updateHandler`` loads a sentence
    pair, ``csSentence`` picks one L1 word sequence at random that satisfies
    the requested constraint and hands it to ``Utils.makeCSSentence``.

    Assumptions (from the original author):
      * a sentence has a single code switch;
      * the selection among the valid candidate sequences is random.
    """

    def __init__(self):
        self.__L1Tree = Parsetree()
        self.__L2Tree = Parsetree()
        # L1 word index -> list of aligned L2 word indices (see __parseAlign).
        self.__align = {}
        self.__utils = Utils()
        # L1 phrase tag -> container of L2 phrase tags accepted as equivalent.
        self.__phraseMap = {}

    def updatePhraseMap(self, phraseMap):
        """Replace the phrase-tag map used by the csType 3 constraint."""
        self.__phraseMap = phraseMap

    def updateHandler(self, l1Sent, l2Sent, alignLine, l1Index):
        """Load a sentence pair and its alignment.

        l1Index is 0 or 1: the position of the L1 word index inside each
        "a-b" pair of alignLine.  When it is truthy the sentences are swapped,
        i.e. l2Sent is loaded into the L1 tree and l1Sent into the L2 tree.
        """
        l2Index = 1 - l1Index
        if l1Index:
            self.__L1Tree.updateTree(l2Sent)
            self.__L2Tree.updateTree(l1Sent)
        else:
            self.__L1Tree.updateTree(l1Sent)
            self.__L2Tree.updateTree(l2Sent)
        self.__align = self.__parseAlign(alignLine, l1Index, l2Index)

    def __parseAlign(self, alignLine, l1Index, l2Index):
        """Parse whitespace-separated "a-b" pairs into {key: [values]}.

        The field at l1Index becomes the key and the field at l2Index the
        value; values of repeated keys are collected in input order.
        Raises ValueError if a field is not an integer.
        """
        align = {}
        for pair in alignLine.split():
            fields = pair.split("-")
            key = int(fields[l1Index])
            value = int(fields[l2Index])
            align.setdefault(key, []).append(value)
        return align

    def csSentence(self, csType):
        """Build a code-switched sentence for the loaded pair.

        Returns [csSentence, sequence] on success, or [-1, []] when no
        candidate sequence satisfied the csType constraint.
        Raises ValueError for an unsupported csType (see __selectSequence).
        """
        validSequences = self.__utils.validSequences(self.__L1Tree.sentLen())
        # Assumption that a sentence will have a single code switch.
        sequence = self.__selectSequence(validSequences, csType)
        if sequence == -1:
            return [-1, []]
        csSentence = self.__utils.makeCSSentence(self.__L1Tree.wordTags(),
                                                 sequence,
                                                 self.__L2Tree.wordTags(),
                                                 self.__align)
        return [csSentence, sequence]

    def __selectSequence(self, validSequences, csType):
        """Dispatch to the picker for csType.

        0: aligned L2 sequence is contiguous
        1: L1 sequence is a constituent, L2 sequence is contiguous
        2: both sequences are constituents
        3: both are constituents with the same (or mapped) phrase tag

        Returns the chosen L1 sequence or -1.  Raises ValueError for any
        other csType instead of silently returning None.
        """
        if csType == 0:
            return self.__seqL1SeqL2Cont(validSequences)
        elif csType == 1:
            return self.__seqL1ConstL2Cont(validSequences)
        elif csType == 2:
            return self.__seqL1ConstL2Const(validSequences)
        elif csType == 3:
            return self.__seqL1ConstL2SameConst(validSequences)
        raise ValueError("Unsupported csType: %r" % (csType,))

    def __pickSequence(self, sequences, limit, accept):
        """Draw random candidates until accept(l1Sequence, l2Sequence) holds.

        Gives up on the limit-th iteration, i.e. after limit - 1 failed
        draws, and returns -1.  Also returns -1 for an empty candidate list,
        where random.randrange would otherwise raise ValueError.
        """
        total = len(sequences)
        if total == 0:
            return -1
        # Kept from the original: reseeds the generator on every selection.
        random.seed()
        for _ in range(limit - 1):
            l1Sequence = sequences[random.randrange(total)]
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if accept(l1Sequence, l2Sequence):
                return l1Sequence
        return -1

    def __seqL1SeqL2Cont(self, sequences):
        """Pick an L1 sequence whose aligned L2 sequence is contiguous."""
        return self.__pickSequence(
            sequences, 100,
            lambda l1Sequence, l2Sequence:
                self.__L2Tree.isContiguous(l2Sequence))

    def __seqL1ConstL2Cont(self, sequences):
        """Pick an L1 constituent whose aligned L2 sequence is contiguous."""
        return self.__pickSequence(
            sequences, 100,
            lambda l1Sequence, l2Sequence:
                self.__L1Tree.isConstituent(l1Sequence)
                and self.__L2Tree.isContiguous(l2Sequence))

    def __seqL1ConstL2Const(self, sequences):
        """Pick an L1 constituent whose aligned L2 sequence is a constituent."""
        return self.__pickSequence(
            sequences, 100,
            lambda l1Sequence, l2Sequence:
                self.__L1Tree.isConstituent(l1Sequence)
                and self.__L2Tree.isConstituent(l2Sequence))

    def __seqL1ConstL2SameConst(self, sequences):
        """Pick constituents on both sides that carry matching phrase tags."""
        return self.__pickSequence(sequences, 500, self.__sameConstituent)

    def __sameConstituent(self, l1Sequence, l2Sequence):
        """Return True if both sequences are constituents with matching tags.

        Tags match when they are equal or when the L2 tag is listed under the
        L1 tag in the phrase map.  An L1 tag missing from the map counts as
        "no extra equivalents" rather than raising KeyError.
        """
        if not self.__L1Tree.isConstituent(l1Sequence):
            return False
        if not self.__L2Tree.isConstituent(l2Sequence):
            return False
        l1PhraseTag = self.__L1Tree.getPhrase(l1Sequence[0])
        l2PhraseTag = self.__L2Tree.getPhrase(l2Sequence[0])
        # Both the phrases are same, for dual structure principle.
        if l1PhraseTag == l2PhraseTag:
            return True
        return l2PhraseTag in self.__phraseMap.get(l1PhraseTag, ())