Beispiel #1
0
 def __init__(self):
   """Create the handler with fresh helper objects and empty mappings."""
   # Two independent Parsetree instances, stored as the L1 and L2 trees.
   self.__L1Tree, self.__L2Tree = Parsetree(), Parsetree()
   self.__utils = Utils()
   # Both dictionaries start out empty.
   self.__align = {}
   self.__phraseMap = {}
Beispiel #2
0
class CSHandler:
  """Build a code-switched sentence from two parse trees and an alignment.

  The handler keeps one Parsetree per language (L1 and L2), a word
  alignment mapping L1 indices to lists of L2 indices, and an optional
  phrase map used by the strictest selection mode (csType 3).
  """

  def __init__(self):
    """Create empty trees, an empty alignment and an empty phrase map."""
    self.__L1Tree = Parsetree()
    self.__L2Tree = Parsetree()
    self.__align = {}
    self.__utils = Utils()
    self.__phraseMap = {}

  def updatePhraseMap(self, phraseMap):
    """Replace the phrase map.

    phraseMap is looked up by L1 phrase tag and must yield a container of
    L2 phrase tags that count as equivalent to it (used by csType 3).
    """
    self.__phraseMap = phraseMap

  def updateHandler(self, l1Sent, l2Sent, alignLine, l1Index):
    """Load a new sentence pair and its alignment.

    l1Index is expected to be 0 or 1: it selects which field of every
    "a-b" alignment pair is the L1 index. When it is 1 the two sentences
    are swapped, i.e. l2Sent feeds the L1 tree and l1Sent the L2 tree.
    """
    l2Index = 1 - l1Index
    if l1Index:
      self.__L1Tree.updateTree(l2Sent)
      self.__L2Tree.updateTree(l1Sent)
    else:
      self.__L1Tree.updateTree(l1Sent)
      self.__L2Tree.updateTree(l2Sent)
    self.__align = self.__parseAlign(alignLine, l1Index, l2Index)

  def __parseAlign(self, alignLine, l1Index, l2Index):
    """Parse whitespace-separated "a-b" pairs into {l1: [l2, ...]}.

    l1Index / l2Index choose which side of each pair is the key and which
    is the value. Values are kept in the order they appear in alignLine.
    """
    align = {}
    for pair in alignLine.split():
      fields = pair.split("-")
      key = int(fields[l1Index])
      value = int(fields[l2Index])
      align.setdefault(key, []).append(value)
    return align

  def csSentence(self, csType):
    """Return [codeSwitchedSentence, l1Sequence] for the loaded pair.

    csType selects the constraint on the switched span (see
    __selectSequence). When no acceptable span is found the result is
    [-1, []]. Raises ValueError for an unknown csType.
    """
    validSequences = self.__utils.validSequences(self.__L1Tree.sentLen())
    # Assumption that a sentence will have a single code switch.
    sequence = self.__selectSequence(validSequences, csType)
    if sequence == -1:
      return [-1, []]
    switched = self.__utils.makeCSSentence(self.__L1Tree.wordTags(), sequence, self.__L2Tree.wordTags(), self.__align)
    return [switched, sequence]

  ## Assumptions:
  ## There is always a single code switch
  ## The selection among the valid candidate sequences is random
  def __selectSequence(self, validSequences, csType):
    """Dispatch to the selection strategy for csType (0 to 3).

    Returns the chosen L1 sequence, or -1 when none was found. An
    unknown csType raises ValueError instead of silently returning None.
    """
    if csType == 0:
      return self.__seqL1SeqL2Cont(validSequences)
    elif csType == 1:
      return self.__seqL1ConstL2Cont(validSequences)
    elif csType == 2:
      return self.__seqL1ConstL2Const(validSequences)
    elif csType == 3:
      return self.__seqL1ConstL2SameConst(validSequences)
    raise ValueError("Unknown csType: %r" % (csType,))

  def __drawSequence(self, sequences, accept, giveUpCount, label):
    """Randomly draw candidates until accept(l1Sequence, l2Sequence) holds.

    At most giveUpCount - 1 candidates are drawn (the same budget as the
    original counter, which bailed out when it reached giveUpCount). On
    failure a diagnostic line is written to stderr and -1 is returned;
    -1 is also returned straight away when there are no candidates.

    The global random generator is deliberately not reseeded here, so a
    seed set by the caller stays in effect.
    """
    # Imported locally so the diagnostic below does not depend on a
    # module-level import of sys.
    import sys
    if not sequences:
      return -1
    for _ in range(giveUpCount - 1):
      l1Sequence = random.choice(sequences)
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if accept(l1Sequence, l2Sequence):
        return l1Sequence
    sys.stderr.write(label + "InfLoop:" + str(giveUpCount) + "\n")
    return -1

  def __seqL1SeqL2Cont(self, sequences):
    """csType 0: any L1 sequence whose L2 counterpart is contiguous."""
    def accept(l1Sequence, l2Sequence):
      return self.__L2Tree.isContiguous(l2Sequence)
    return self.__drawSequence(sequences, accept, 100, "L1SeqL2Cont")

  def __seqL1ConstL2Cont(self, sequences):
    """csType 1: L1 sequence is a constituent, L2 counterpart contiguous."""
    def accept(l1Sequence, l2Sequence):
      return (self.__L1Tree.isConstituent(l1Sequence)
              and self.__L2Tree.isContiguous(l2Sequence))
    return self.__drawSequence(sequences, accept, 100, "L1Const2Cont")

  def __seqL1ConstL2Const(self, sequences):
    """csType 2: both the L1 sequence and its L2 counterpart are constituents."""
    def accept(l1Sequence, l2Sequence):
      return (self.__L1Tree.isConstituent(l1Sequence)
              and self.__L2Tree.isConstituent(l2Sequence))
    return self.__drawSequence(sequences, accept, 100, "L1Const2Const")

  def __seqL1ConstL2SameConst(self, sequences):
    """csType 3: both sides are constituents with matching phrase tags.

    Tags match when they are equal or when the L2 tag is listed in the
    phrase map under the L1 tag. An L1 tag missing from the phrase map is
    treated as having no equivalents.
    """
    def accept(l1Sequence, l2Sequence):
      if not self.__L1Tree.isConstituent(l1Sequence):
        return False
      if not self.__L2Tree.isConstituent(l2Sequence):
        return False
      l1PhraseTag = self.__L1Tree.getPhrase(l1Sequence[0])
      l2PhraseTag = self.__L2Tree.getPhrase(l2Sequence[0])
      ## Both the phrases are same, for dual structure principle
      if l1PhraseTag == l2PhraseTag:
        return True
      return l2PhraseTag in self.__phraseMap.get(l1PhraseTag, ())
    return self.__drawSequence(sequences, accept, 500, "L1Const2SameConst")