Ejemplo n.º 1
0
          ## Both the phrases are same, for dual structure principle
          if l1PhraseTag == l2PhraseTag or l2PhraseTag in self.__phraseMap[l1PhraseTag]:
            # Debugging !!
            #sys.stderr.write("Alignment: "+str(align)+"\n")
            #sys.stderr.write("L2Sequence: "+str(l2Sequence)+" Same Const: "+l1PhraseTag+"\n")
            ##############
            break
    return l1Sequence
  
if __name__ == "__main__":
  # Manual smoke test: load the parallel and monolingual corpora, then print
  # the first alignment and one code-switched sentence built from the first
  # sentence pair.
  hindiParsePath = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/hinParse.wx"
  englishParsePath = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/engParse.wx"
  hindiAlignPath = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/hinAlign.wx"
  englishAlignPath = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/engAlign.wx"
  hindiPurePath = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/HinPOS/hindiTrain.wx"
  englishPurePath = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/EngPOS/train.0-18.tsv"

  # Hindi is L1 and English is L2 throughout.
  reader = Utils()
  parL1 = reader.readSentences(hindiParsePath)
  parL2 = reader.readSentences(englishParsePath)
  align = reader.readAligns(hindiAlignPath, englishAlignPath)
  pureL1 = reader.readSentencesPlain(hindiPurePath)
  pureL2 = reader.readSentencesPlain(englishPurePath)

  print(align[0])
  handler = CSHandler()
  # Last argument is the order flag (0 here); csSentence(4) requests variant 4.
  handler.updateHandler(parL1[0], parL2[0], align[0], 0)
  print(handler.csSentence(4))
  
Ejemplo n.º 2
0
class DataHandler:
  def __init__(self):
    """Create an empty handler and load the POS tag maps.

    Construction reads both tag-mapping files through genPosMap(), so it
    raises if either path below cannot be opened.
    """
    ## Resources: language IDs (L1 first, L2 second) and tag-mapping files
    self.LID = ["HI", "EN"]
    self.l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/LastSem/Data/UniMaps/hi-hyd.map"
    self.l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/LastSem/Data/UniMaps/en-ptb.map"

    ## Corpus containers, filled by loadData()
    self.parL1, self.parL2, self.align = [], [], []
    self.pureL1, self.pureL2 = [], []

    ## Tag inventories and the source-tag -> mapped-tag table, filled by genPosMap()
    self.L1Tags, self.L2Tags, self.commonTags = set(), set(), set()
    self.posMap = {}

    ## Pre-processing
    self.genPosMap()

    ## Reader used by loadData()
    self.__utils = Utils()
    
  def addLangTags(self, WordTags, lTag):
    wordTags = []
    for wt in WordTags:
      newWT = [i for i in wt]
      wordTags.append(newWT)
    for index in range(len(wordTags)):
      wordTags[index].append(lTag)
    return wordTags
  
  def makeLD(self, wordsTagsLangs):
    newLine = []
    for index in range(len(wordsTagsLangs)):
      wordTagLang = copy.deepcopy(wordsTagsLangs[index])
      wordTagLang[1] = wordTagLang[1] + '_' + wordTagLang[2]
      newLine.append(wordTagLang)
    return newLine
  
  def genPosMap(self):
    for i in open(self.l1MapFile):
      i = i.strip()
      srcTag = i.split()[0]
      uniTag = i.split()[1]
      self.posMap[srcTag] = uniTag

    for i in open(self.l2MapFile):
      i = i.strip()
      srcTag = i.split()[0]
      uniTag = i.split()[1]
      self.posMap[srcTag] = uniTag  
    
    self.L1Tags = set()
    for line in open(self.l1MapFile):
      tag = line.split()[0]
      self.L1Tags.add(tag)
    for line in open(self.l2MapFile):
      tag = line.split()[0]
      self.L2Tags.add(tag)
    self.commonTags = set([c for c in self.L1Tags if c in self.L2Tags])
  
  def map2Uni(self, wordTagsLangs):
    newLine = []
    for index in range(len(wordTagsLangs)):
      newLine.append(wordTagsLangs[index])
      tag = wordTagsLangs[index][1]
      try:
        newLine[index][1] = self.posMap[tag]
      except:
        newLine[index][1] = 'X'
    return newLine
  
  def mapLD2Uni(self, wordTagsLangs):
    newLine = []
    for index in range(len(wordTagsLangs)):
      wordTagLang = copy.deepcopy(wordTagsLangs[index])
      tag = wordTagLang[1].split("_")[0]
      lang = wordTagLang[2] 
      try:
        wordTagLang[1] = self.posMap[tag] + "_" + lang 
      except:
        wordTagLang[1] = 'X' + "_" + lang 
      newLine.append(wordTagLang)
    return newLine
  
  def map2UniControl(self, wordTagsLangs):
    newLine = []
    for index in range(len(wordTagsLangs)):
      newLine.append(wordTagsLangs[index])
      tag = wordTagsLangs[index][1]
      lang = wordTagsLangs[index][2]
      try:
        newLine[index][1] = self.posMap[tag] + '_' + lang
      except:
        newLine[index][1] = 'X' + '_' + lang
    return newLine
  
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    self.parL1 = self.__utils.readSentences(l1Data)
    self.parL2 = self.__utils.readSentences(l2Data)
    self.align = self.__utils.readAligns(l1Aligns, l2Aligns)
    self.pureL1 = self.__utils.readSentencesPlain(pureL1Data)
    self.pureL2 = self.__utils.readSentencesPlain(pureL2Data)
    sys.stderr.write("parL1:" + str(len(self.parL1)) + "\n")
    sys.stderr.write("parL2:" + str(len(self.parL2)) + "\n")
    sys.stderr.write("align:" + str(len(self.align)) + "\n")
    sys.stderr.write("pureL1:" + str(len(self.pureL1)) + "\n")
    sys.stderr.write("pureL2:" + str(len(self.pureL2)) + "\n")
Ejemplo n.º 3
0
class DataGenerator:
  def __init__(self, outDir):
    """Configure the generator and load its mapping resources.

    outDir is used as a plain string prefix for every output file name, so
    it should end with a path separator.  Construction reads the two tag
    maps and the phrase map from disk and hands the phrase map and language
    IDs to the CS handler.
    """
    sys.stderr.write("DataGenerator: Constructor\n")

    ## Languages (L1 first, L2 second), tag-mapping files, output prefix
    self.__LID = ["HI","EN"]
    self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
    self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
    self.__outputDir = outDir

    ## Data containers, filled by loadData()
    self.__parL1, self.__parL2, self.__align = [], [], []
    self.__pureL1, self.__pureL2 = [], []

    ## Lookup tables and bookkeeping
    self.__posMap = {}
    self.__phraseMap = dd(list)
    self.__L1Tags, self.__L2Tags, self.__commonTags = set(), set(), set()
    self.__csHash = set()

    ## Collaborators
    self.__csInstance = CSHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()

    ## Generation variants ("real test" settings).  The exploratory values
    ## that used to be assigned first were tagsetVariants ["", ".uni"] and
    ## dataRange range(50,900,50).
    self.__csVariants = [0,1,2,3,4]
    self.__tagsetVariants = [""]
    self.__dataRange = [400]
    self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}
    self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]
    self.__fileSuffix = ""

    ## Pre processing
    self.__genPosMap()
    self.__genPhraseMap()
    self.__csInstance.updatePhraseMap(self.__phraseMap)
    self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
 
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    """Read the parallel sentences, their alignments and the monolingual
    sentences through the Utils reader, then report how many items each
    container holds on stderr.
    """
    reader = self.__utils
    self.__parL1 = reader.readSentences(l1Data)
    self.__parL2 = reader.readSentences(l2Data)
    self.__align = reader.readAligns(l1Aligns, l2Aligns)
    self.__pureL1 = reader.readSentencesPlain(pureL1Data)
    self.__pureL2 = reader.readSentencesPlain(pureL2Data)

    summary = (
      ("parL1", self.__parL1),
      ("parL2", self.__parL2),
      ("align", self.__align),
      ("pureL1", self.__pureL1),
      ("pureL2", self.__pureL2),
    )
    for label, items in summary:
      sys.stderr.write(label+":"+str(len(items))+"\n")
  
  def __genTrainData(self):
    statusCount = 0
    for data in self.__dataRange:
      #control = 0
      #while 1:
      for Split in self.__splits:
      #for control in range(3):
        #if control == 3:
        #  break
        #pr= int(control*1.0/2 * data)
        pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
        tr = data - pr
        pr = pr/2
        
        print pr
        random.seed()
        pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
        pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
        
        for csType in self.__csVariants:
          self.__csHash = set()
          ##sys.stderr.write("csType:"+str(csType)+'\n')
          # Debugging !!
          #switch = ""
          #############
          #for tag in self.__tagsetVariants:
            # Debugging !!
            #if switch == "yes":
            #    break
            ###################
            #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
          
          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
          ##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w')
          ##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w')
          ##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w')
          
          for index in pIndicesL1:
            line = self.__pureL1[index]
            #sys.stderr.write("L1 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
            
          for index in pIndicesL2:
            line = self.__pureL2[index]
            #sys.stderr.write("L2 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
            
          stopLength = tr+1
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              index = 0
            csLine = ""
            order = stopLength%2
            #sys.stderr.write("order:"+str(order)+'\n')
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            # Debugging !!                         
            #sys.stderr.write("Switch to another CS variant?? ")
            #switch = raw_input()
            #if switch == "yes":
            #    break
            ###############
            csLine = csReturn[0]
            #csSequence = csReturn[1]
            #print csReturn[1]
            hashKey = (index, tuple(csReturn[1]))
            #print hashKey
            if csLine != -1 and hashKey not in self.__csHash:
              self.__csHash.add(hashKey)
              stopLength -= 1
            else:
              continue
            #sys.stderr.write("csLine:"+str(csLine)+'\n')
            #csLine = self.__addLangTags(csLine)
            
            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLine)))+'\n')
            csLineUni = self.__map2Uni(csLine)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLineUni)))+'\n')'''
            
          dataFile.close()
          ##dataFileUni.close()
          ##dataFileUniq.close()
          ##dataFileUniUniq.close()
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            #pr -= 1
            dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
        #pr += 1
    print statusCount
    
  def __genTrainDataDiverse(self):
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        for Split in self.__splits:
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          pr = pr/2
          
          print pr
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          
          ##for csType in self.__csVariants:
          self.__csHash = set()
          
          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
          
          #### Dangerous ####
          
          ##pIndicesL1 = []
          ##pIndicesL2 = []
          
          #### End of Dangerous ####
          
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
          stopLength = tr+1
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              break
              index = 0
            csLine = ""
            order = stopLength%2
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            hashKey = (index, tuple(csReturn[1]))
            if csLine != -1 and hashKey not in self.__csHash:
              #self.__csHash.add(hashKey)
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue
            
            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            
          dataFile.close()
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def __genTrainDataDup(self):
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        
        for splitIndex in range(len(self.__splits)):
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          ##for csType in self.__csVariants:
          self.__csHash = set()
          stopLength = tr
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              break
              index = 0
            csLines = []
            for order in range(2):
            #order = stopLength%2
              self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
              csReturn = self.__csInstance.csSentence(csType)
              csLine = csReturn[0]
              if csLine != -1:
                csLines.append(csLine)
            if len(csLines) == 2:
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n')
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n')
              ##if splitIndex == 0:
              self.__Tree.updateTree(self.__parL1[index])
              pureLine = self.__Tree.wordTags()
              pureLine = self.__addLangTags(pureLine, self.__LID[0])
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__Tree.updateTree(self.__parL2[index])
              pureLine = self.__Tree.wordTags()
              #print pureLine
              pureLine = self.__addLangTags(pureLine, self.__LID[1])
              #print pureLine
              #sys.exit(0)
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue
            
            if stopLength <= 0:
              break
            
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()
          
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def collectData(self):
    """Build the first-split data set for CS type 4 and return it.

    Only csType 4 is processed.  For the first size in __dataRanges[4] the
    first split is generated and written to its training and "_Control"
    files; as soon as the split loop reaches its second iteration the
    method returns the list collected so far.

    The returned list mixes two kinds of entries:
      * formatted pure sentence lines, as plain strings ("(pureLine)" is a
        parenthesised string, not a 1-tuple);
      * 3-tuples (L1 sentence, L2 sentence, alignment) for every parallel
        sentence whose code-switched pair was accepted.

    If __splits has fewer than two entries (or __dataRanges[4] is empty) the
    early return is never taken; the method then falls through, prints the
    file count and implicitly returns None.
    """
    statusCount = 0
    for csType in [4]:
      for data in self.__dataRanges[csType]:
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          # Early exit: only the first split is ever generated here.
          if splitIndex > 0:
            return initialSlitCSData
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          # pr: pure sentences sampled per language; tr: CS pairs to collect.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            # NOTE(review): this appends the string itself, not a 1-tuple.
            initialSlitCSData.append((pureLine))
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            # NOTE(review): this appends the string itself, not a 1-tuple.
            initialSlitCSData.append((pureLine))

          # Unreachable: the early return above fires for every splitIndex > 0.
          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            self.__csHash = set()
            stopLength = tr
            index = -1
            # Cycle through the parallel data until tr pairs are accepted;
            # the loop has no other exit.
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              # Try both orders; -1 marks a failed attempt.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                # Accept the pair only when the two switched sentences use
                # exactly the word set of the two original sentences.
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  # Always true in this branch; the raw inputs are stored,
                  # not the formatted lines.
                  if splitIndex == 0:
                    initialSlitCSData.append((self.__parL1[index],self.__parL2[index], self.__align[index]))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            # Unreachable as written: the loop above only exits once
            # stopLength <= 0.
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
  def __genFromSingleData(self):
    dataset = self.collectData()
    CSData = [d for d in dataset if len(d)==3]
    PUREData = [d[1] for d in dataset if len(d)==1]
    pureFile = open(self.__outputDir+"Baseline"+self.__fileSuffix,'w')
    pureFlag = 1
    for csType in self.__csVariants:
      for splitIndex in range(len(self.__splits)):
        Split = self.__splits[splitIndex]
        dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(len(dataset))+self.__fileSuffix,'w')
        stopLength = len(CSData)/2
        csData = []
        pureData = []
        index = -1
        while 1:
          index += 1
          if index == len(self.__parL1):
            ##break
            index = 0
            print "Still:",stopLength," Looping.."
          csLines = []
          csSeqs = []
          for order in range(2):
          #order = stopLength%2
            self.__csInstance.updateHandler(CSData[index][0], CSData[index][1], CSData[index][2], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            if csLine != -1:
              csLines.append(csLine)
              csSeqs.append(csReturn[1])
          if len(csLines) == 2:
            csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
            self.__Tree.updateTree(self.__parL1[index])
            pureLine1 = self.__Tree.wordTags()
            pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
            self.__Tree.updateTree(self.__parL2[index])
            pureLine2 = self.__Tree.wordTags()
            pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
            pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
            if pureWords == csWords:
              p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
              p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
              pureData.append(p1)
              pureData.append(p2)
              cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
              csData.append(cs1)
              cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
              csData.append(cs2)
              self.__csHash.add(index)
              stopLength -= 1
          else:
            continue
          
          if stopLength <= 0:
            break
          
        if stopLength > 0:
          print tr, stopLength, "Training Break!!"
          dummy = raw_input()
        
        for csLine in csData:
          dataFile.write(csLine)
        if pureFlag:
          pureFlag = 0
          for pureLine in pureData:
            pureFile.write(pureLine)
          for pureLine in PUREData:
            pureFile.write(pureLine)
          pureFile.close()
        dataFile.close()
  
  def __genTrainDataDupStrict(self):
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    initialSlitCSData.append((cs1,cs2, p1, p2))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
    
  def __addLangTags(self, WordTags, lTag):
    """Return a new line in which every entry of WordTags is copied into a
    list and extended with the language tag lTag.

    The input entries are not modified.
    """
    return [list(entry) + [lTag] for entry in WordTags]
  
  def __genPosMap(self):
    """Load the L1 and L2 tag-mapping files.

    Each non-blank line is expected to hold a source tag followed by its
    mapped tag, separated by whitespace.  After the call:

      * __posMap maps every source tag to its mapped tag (L2 entries are
        read last and therefore win on a clash);
      * __L1Tags / __L2Tags hold the source tags of the respective file;
      * __commonTags holds the source tags present in both files.

    Both tag sets are rebuilt from scratch on every call.  Blank lines are
    skipped; a line with only one field still raises IndexError.
    """
    l1Tags = set()
    l2Tags = set()
    # L1 first, then L2, so that L2 overrides L1 in the map as before.
    for mapPath, tagSet in ((self.__l1MapFile, l1Tags), (self.__l2MapFile, l2Tags)):
      # 'with' closes the handle even if a malformed line raises.
      with open(mapPath) as handle:
        for line in handle:
          fields = line.split()
          if not fields:
            continue
          self.__posMap[fields[0]] = fields[1]
          tagSet.add(fields[0])

    self.__L1Tags = l1Tags
    self.__L2Tags = l2Tags
    self.__commonTags = l1Tags & l2Tags
  
  def __map2Uni(self, wordTagsLangs):
    """Return a copy of the line with each tag (index 1) replaced by its
    __posMap entry, or 'X' when the tag has no entry.

    Every entry is copied before it is rewritten, so the caller's line keeps
    its original tags and can still be written out afterwards.
    """
    newLine = []
    for entry in wordTagsLangs:
      mapped = list(entry)
      # Explicit default instead of a bare except: only a missing tag
      # falls back to 'X'.
      mapped[1] = self.__posMap.get(mapped[1], 'X')
      newLine.append(mapped)
    return newLine
  
  def __map2UniControl(self, wordTagsLangs):
    """Return a copy of the line with each tag (index 1) replaced by
    '<mapped>_<lang>', where <mapped> is the __posMap entry for the tag ('X'
    when absent) and <lang> is the entry's language field (index 2).

    Every entry is copied before it is rewritten, so the caller's line keeps
    its original tags.
    """
    newLine = []
    for entry in wordTagsLangs:
      mapped = list(entry)
      lang = mapped[2]
      # Explicit default instead of a bare except: only a missing tag
      # falls back to 'X'.
      mapped[1] = self.__posMap.get(mapped[1], 'X')+'_'+lang
      newLine.append(mapped)
    return newLine
  
  def __genPhraseMap(self):
    """Load the phrase-tag mapping file into __phraseMap.

    Each non-blank line holds a phrase tag followed by a comma-separated
    list of tags; the listed tags are appended to the list stored under the
    first tag.  Blank lines are skipped; a line with only one field still
    raises IndexError.
    """
    # 'with' closes the handle even if a malformed line raises.
    with open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping") as phraseMapFile:
      for line in phraseMapFile:
        fields = line.split()
        if not fields:
          continue
        self.__phraseMap[fields[0]].extend(fields[1].split(","))
    
  def generateData(self):
    """Public entry point: run the strict duplicate-pair generator once."""
    # For several numbered runs, set self.__fileSuffix (e.g. "."+str(i))
    # before each call.  __genTrainDataDiverse() is the alternative generator.
    self.__genTrainDataDupStrict()