Example #1
0
 def __init__(self, outDir):
   """Set up the FR/EN code-switching data generator.

   Stores the tagset-mapping file paths, creates empty corpus containers,
   fixes the generation variants, then runs the map-building pre-processing
   and hands the phrase map to the CS handler.
   """
   sys.stderr.write("DataGenerator: Constructor\n")
   # Language pair; index 0 is treated as L1 throughout.
   self.__LID = ["FR","EN"]
   # Universal-tagset mapping files for each side.
   self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/en-ptb.map"
   self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/fr-paris.map"
   # Parallel corpus containers and derived lookup tables.
   self.__L1, self.__L2, self.__align = [], [], []
   self.__outputDir = outDir
   self.__posMap = {}
   self.__phraseMap = dd(list)
   self.__csInstance = CSHandler()
   self.__utils = Utils()
   # Generation variants: CS strategies, tagset flavours, data sizes.
   self.__csVariants = [0, 1, 2]
   self.__tagsetVariants = ["", ".uni"]
   self.__dataRange = range(52, 1000, 52)
   # Tag inventories used for language identification.
   self.__L1Tags, self.__L2Tags, self.__commonTags = set(), set(), set()
   # Pre-processing: build the maps, then share the phrase map.
   self.__genPosMap()
   self.__genPhraseMap()
   self.__csInstance.updatePhraseMap(self.__phraseMap)
Example #2
0
 def __init__(self, outDir):
   """Create the generator: wire up the helper objects, remember the output
   directory, and build the default generation configuration last."""
   self.config = None        # populated by prepareConfig() below
   self.__csHash = set()     # keys of CS sentences already emitted
   self.__outputDir = outDir
   # Collaborators (construction order kept; constructors may log).
   self.__csInstance = CSHandler()
   self.__dataHandler = DataHandler()
   self.__utils = Utils()
   self.__Tree = Dependencytree()
   self.__fileSuffix = ""    # optional per-run suffix appended to file names
   self.prepareConfig()
Example #3
0
 def __init__(self, outDir):
   """Set up the HI/EN code-switching data generator.

   Builds paths, containers and generation variants, runs the map-building
   pre-processing, then applies "real test" overrides that replace some of
   the defaults declared above them (kept as an experimental record).
   """
   sys.stderr.write("DataGenerator: Constructor\n")
   ## Languages and Order; index 0 is treated as L1 throughout.
   self.__LID = ["HI","EN"]
   # Universal-tagset mapping files for each side.
   self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
   self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
   ## Data containers: parallel corpus, alignments, and monolingual pools.
   self.__parL1 = []
   self.__parL2 = []
   self.__align = []
   self.__pureL1 = []
   self.__pureL2 = []
   self.__outputDir = outDir
   self.__posMap = {}
   self.__phraseMap = dd(list)
   self.__csInstance = CSHandler()
   self.__utils = Utils()
   self.__Tree = Dependencytree()
   
   ## Generation Variants: CS strategies, tagset flavours, data sizes.
   self.__csVariants = [0,1,2,3,4]
   self.__tagsetVariants = ["",".uni"]
   self.__dataRange = range(50,900,50)
   ##self.__dataRange = [200]
   # (pure%, CS%) mixing ratios for each generated training set.
   self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]
   self.__csHash = set()  # keys of CS sentences already emitted
   ##LID stuff: tag inventories used for language identification.
   self.__L1Tags = set()
   self.__L2Tags = set()
   self.__commonTags = set()
   ## Pre processing: build maps, then configure the CS handler.
   self.__genPosMap()
   self.__genPhraseMap()
   self.__csInstance.updatePhraseMap(self.__phraseMap)
   self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
   
   ## Real test overwrites -- these intentionally replace the defaults set
   ## above for the current experiment; commented lines are earlier setups.
   #self.__csVariants = [1,2,3,4]
   self.__tagsetVariants = [""]
   self.__dataRange = [400]
   # Per-CS-type data sizes (overrides __dataRange where consulted).
   self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}
   #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]}
   #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]}
   #self.__splits = [(50,50)]
   #for i in range(0,51,5):
   #  split = (100-i, i)
   #  self.__splits.append(split)
   self.__fileSuffix = ""
Example #4
0
class Generator:
  def __init__(self, outDir):
    self.config = None
    self.__csHash = set()
    self.__outputDir = outDir
    self.__csInstance = CSHandler()
    self.__dataHandler = DataHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    self.__fileSuffix = ""
    self.prepareConfig()
    
  def prepareConfig(self):
    self.config = GeneratingConfig()
    self.config.setCSVariants([0, 1, 2, 3, 4])
    self.config.setDataRanges({0:range(50, 1001, 50), 1:range(50, 1001, 50), 2:range(50, 1001, 50), 3:range(50, 1001, 50), 4:range(50, 1001, 50)})
    self.config.setSplits([(50, 50), (60, 40), (70, 30), (80, 20), (90, 10)])
    self.config.setTagsetVariants([".uniq", ".uni"])
  
  def prepareGenerator(self):
    self.__csInstance.updateLIDTags(self.__dataHandler.LID[0], self.__dataHandler.LID[1])
  
  def prepareRealTest(self, dataFile, outFile):
    dataFile = open(dataFile)
    outFile = open(outFile, 'w')
    for line in dataFile:
      line = map(lambda x:x.split('_#'), line.strip().split())
      uniLine = self.__dataHandler.mapLD2Uni(line)
      outFile.write(' '.join(map(lambda x:'_#'.join(x), uniLine)) + '\n')
    outFile.close()

  def generateTestData(self):
    self.config.setDataRanges({0:range(30, 151, 50), 1:range(30, 151, 50), 2:range(30, 151, 50), 3:range(30, 151, 50), 4:range(30, 151, 50)})
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Testing Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()
  
  
  def generateDataForTest(self):
    for i in range(10):
      self.__fileSuffix = "."+str(i)
      self.generateTrainDataForTest()
  
  def generateTrainDataForTest(self):
    self.config.setDataRanges({0:[450], 1:[450], 2:[450], 3:[450], 4:[450]})
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType),
      for data in self.config.dataRanges[csType]:
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          if splitIndex == len(self.config.splits) - 1:
            print
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.. ",
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  
  def generateTrainData(self):
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def generateUCTrainData(self): # Unknown words constrained training data
    statusCount = 0
    for csType in self.config.csVariants:
      for data in self.config.dataRanges[csType]:
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print pr
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  def makeString(self, wordsTagsLangs):
    return ' '.join(map(lambda x:"_#".join(x), wordsTagsLangs)) + '\n'
    
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    self.__dataHandler.loadData(l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data)
Example #5
0
class DataGenerator:
  def __init__(self, outDir):
    sys.stderr.write("DataGenerator: Constructor\n")
    ## Languages and Order
    self.__LID = ["FR","EN"]
    self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/en-ptb.map"
    self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/fr-paris.map"
    ## Data containers
    self.__L1 = []
    self.__L2 = []
    self.__align = []
    self.__outputDir = outDir
    self.__posMap = {}
    self.__phraseMap = dd(list)
    self.__csInstance = CSHandler()
    self.__utils = Utils()
    ## Generation Variants
    self.__csVariants = [0,1,2]
    self.__tagsetVariants = ["",".uni"]
    self.__dataRange = range(52,1000,52)
    ##LID stuff
    self.__L1Tags = set()
    self.__L2Tags = set()
    self.__commonTags = set()
    ## Pre processing
    self.__genPosMap()
    self.__genPhraseMap()
    self.__csInstance.updatePhraseMap(self.__phraseMap)
 
  def loadData(self, l1Data, l2Data, aligns):
    self.__L1 = [l.strip() for l in open(l1Data)]
    self.__L2 = [l.strip() for l in open(l2Data)]
    self.__align = [l.strip() for l in open(aligns)]
  
  def __genTestData(self, testIndices):
    for csType in self.__csVariants:
      #for tag in self.__tagsetVariants:
      dataFile = open(self.__outputDir+"TestCSType"+str(csType),'w')
      dataFileUni = open(self.__outputDir+"TestCSType"+str(csType)+".uni",'w')
      stopLength = 5129
      for index in testIndices:
        order = stopLength%2
        self.__csInstance.updateHandler(self.__L1[index], self.__L2[index], self.__align[index], order)
        csReturn = self.__csInstance.csSentence(csType)
        csLine = csReturn[0]
        #csSequence = csReturn[1]
        if csLine != -1:
          stopLength -= 1
        else:
          continue
        self.__addLangTags(csLine)
        csLineUni = self.__map2Uni(csLine)
        dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
        if stopLength == 0:
          break
      dataFile.close()
      dataFileUni.close()
      if stopLength != 0:
        print "Test Break!!", 5129, stopLength
        dummy = raw_input()
  
  def __genTrainData(self, pureIndices, csIndices):
    statusCount = 0
    for data in self.__dataRange:
      pr = 0
      while 1:
      #for pr in range(3):
        if pr == 3:
          break
        pr= int(pr*1.0/2 * data)
        tr = data - pr
        pr = pr/2
        random.seed()
        pIndices = random.sample(pureIndices, pr)
        cIndices = random.sample(csIndices, tr*5)
        for csType in self.__csVariants:
          print csType
          # Debugging !!
          #switch = ""
          #############
          #for tag in self.__tagsetVariants:
            # Debugging !!
            #if switch == "yes":
            #    break
            ###################
            #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(tr)+"Pure"+str(pr*2),'w')
          dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(tr)+"Pure"+str(pr*2)+".uni",'w')
          for index in pIndices:
            l1Line = self.__utils.wordTags(self.__L1[index])
            l2Line = self.__utils.wordTags(self.__L2[index])
            self.__addLangTags(l1Line)
            self.__addLangTags(l2Line)
            l1LineUni = self.__map2Uni(l1Line)
            l2LineUni = self.__map2Uni(l2Line)
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), l1Line))+'\n')
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), l2Line))+'\n')
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), l1LineUni))+'\n')
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), l2LineUni))+'\n')
          stopLength = tr
          for index in cIndices:
            csLine = ""
            order = stopLength%2
            #print order
            self.__csInstance.updateHandler(self.__L1[index], self.__L2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            # Debugging !!                         
            #sys.stderr.write("Switch to another CS variant?? ")
            #switch = raw_input()
            #if switch == "yes":
            #    break
            ###############
            csLine = csReturn[0]
            #csSequence = csReturn[1]
            if csLine != -1:
              stopLength -= 1
            else:
              continue
            self.__addLangTags(csLine)
            csLineUni = self.__map2Uni(csLine)
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
            if stopLength == 0:
              break
          dataFile.close()
          dataFileUni.close()
          if stopLength != 0:
            print tr, stopLength, "Training Break!!"
            pr -= 1
            #dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
        pr += 1
    print statusCount
    
  def __addLangTags(self, wordTags):
    #print self.__L1Tags
    #print self.__L2Tags
    #print wordTags
    for index in range(len(wordTags)):
      tag = wordTags[index][1]
      lang = ""
      if tag in self.__commonTags:
        lang = "C"
      elif tag in self.__L1Tags:
        lang = self.__LID[0]
      elif tag in self.__L2Tags:
        lang = self.__LID[1]
      if lang == "":
        print "Something wrong with the tagsets in the function add_lang"
        dummy = raw_input()
      wordTags[index].append(lang)
  
  def __genPosMap(self):
    for i in open(self.__l1MapFile):
      i = i.strip()
      srcTag = [i.split()[0]]
      uniTag = i.split()[1]
      if srcTag[0].find('|') >= 0:
        srcTag = srcTag[0].split('|')
      for tag in srcTag:
        self.__posMap[tag] = uniTag
    for i in open(self.__l2MapFile):
      i = i.strip()
      srcTag = [i.split()[0]]
      uniTag = i.split()[1]
      if srcTag[0].find('|') >= 0:
        srcTag = srcTag[0].split('|')
      for tag in srcTag:
        self.__posMap[tag] = uniTag
        
    
    self.__L1Tags = set()
    for line in open(self.__l1MapFile):
      tags = line.split()[0].split('|')
      for tag in tags:
        self.__L1Tags.add(tag)
    for line in open(self.__l2MapFile):
      tags = line.split()[0].split('|')
      for tag in tags:
        self.__L2Tags.add(tag)
    self.__commonTags = set([c for c in self.__L1Tags if c in self.__L2Tags])
  
  def __map2Uni(self, wordTagsLangs):
    newLine = []
    for index in range(len(wordTagsLangs)):
      newLine.append(wordTagsLangs[index])
      tag = wordTagsLangs[index][1]
      try:
        newLine[index][1] = self.__posMap[tag]
      except:
        dummy = raw_input("Something wrong.. Couldn't find Uni Map\n")
    return newLine
  
  def __genPhraseMap(self):
    phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping")
    for i in phraseMapFile:
      i = i.strip()
      self.__phraseMap[i.split()[0]].extend(i.split()[1].split(","))
    
  def __randomSample(self):
    print "Random Sample Train"
    totalLines = 95129
    testLines = 15129
    testIndices = random.sample(range(totalLines),testLines)
    #print testIndices
    trainIndices = []
    for i in range(totalLines):
      if i not in testIndices:
        trainIndices.append(i)
    csIndices = random.sample(trainIndices, 6000)
    remaining = []
    for i in trainIndices:
      if i not in csIndices:
        remaining.append(i)
    pureIndices = random.sample(remaining,6000)
    return trainIndices, testIndices, pureIndices, csIndices

  def __getRanges(self, dataRanges):
    dataRanges = open(dataRanges)
    trainIndices = []
    testIndices = []
    pureIndices = []
    csIndices = []
    for i in dataRanges:
        if i.split(":")[0] == "trainIndices":
            indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
            for j in  indices:
                j = int(j)
                trainIndices.append(j)
        elif i.split(":")[0] == "testIndices":
            indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
            for j in  indices:
                j = int(j)
                testIndices.append(j)
        elif i.split(":")[0] == "pureIndices":
            indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
            for j in  indices:
                j = int(j)
                pureIndices.append(j)
        elif i.split(":")[0] == "csIndices":
            indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
            for j in  indices:
                j = int(j)
                csIndices.append(j)
    #print len(testIndices), len(trainIndices), len(pureIndices), len(csIndices)
    #************************************
    # Un-comment this when sampling again
    #************************************
    '''rangesFile = open(dataRanges,'w')
    rangesFile.write("trainIndices:"+str(trainIndices)+"\n")
    rangesFile.write("testIndices:"+str(testIndices)+"\n")
    rangesFile.write("pureIndices:"+str(pureIndices)+"\n")
    rangesFile.write("csIndices:"+str(csIndices)+"\n")
    rangesFile.close()
    return self.__randomSample()'''
    return trainIndices, testIndices, pureIndices, csIndices
    
  def generateData(self, dataRanges):    
    ranges = self.__getRanges(dataRanges)
    #unmappedEnglish = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/unmapped")
    #unmappedEnPhrases = [i.strip() for i in unmappedEnglish]
    #trainIndices = ranges[0]
    testIndices = ranges[1]
    pureIndices = ranges[2]
    csIndices = ranges[3]
    self.__genTrainData(pureIndices, csIndices)
    self.__genTestData(testIndices)
Example #6
0
class DataGenerator:
  def __init__(self, outDir):
    """Set up a Hindi-English code-switch data generator writing to outDir."""
    sys.stderr.write("DataGenerator: Constructor\n")
    # Language identifiers: index 0 = L1 = Hindi, index 1 = L2 = English.
    self.__LID = ["HI","EN"]
    # Universal-tagset mapping files for the two languages.
    self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
    self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
    # Parallel sentences, alignments and monolingual ("pure") corpora.
    self.__parL1 = []
    self.__parL2 = []
    self.__align = []
    self.__pureL1 = []
    self.__pureL2 = []
    self.__outputDir = outDir
    # Tag/phrase maps and helper objects.
    self.__posMap = {}
    self.__phraseMap = dd(list)
    self.__csInstance = CSHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    # Generation variants: CS strategies, tagsets, sizes, pure/CS splits,
    # plus the dedup set used while generating.
    self.__csVariants = [0,1,2,3,4]
    self.__tagsetVariants = ["",".uni"]
    self.__dataRange = range(50,900,50)
    self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]
    self.__csHash = set()
    # Tag inventories used for language-id tagging.
    self.__L1Tags = set()
    self.__L2Tags = set()
    self.__commonTags = set()
    # Pre-processing: build the maps and hand them to the CS handler.
    self.__genPosMap()
    self.__genPhraseMap()
    self.__csInstance.updatePhraseMap(self.__phraseMap)
    self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
    # "Real test" overrides: single tagset, one total size, and a
    # per-CS-variant size range.
    self.__tagsetVariants = [""]
    self.__dataRange = [400]
    self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40),
                         2:range(35,540,35), 3:range(30,451,30),
                         4:range(15,231,15)}
    self.__fileSuffix = ""
 
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    """Read parallel sentences, alignments and monolingual corpora into memory."""
    self.__parL1 = self.__utils.readSentences(l1Data)
    self.__parL2 = self.__utils.readSentences(l2Data)
    self.__align = self.__utils.readAligns(l1Aligns, l2Aligns)
    self.__pureL1 = self.__utils.readSentencesPlain(pureL1Data)
    self.__pureL2 = self.__utils.readSentencesPlain(pureL2Data)
    # Report the size of every container we just loaded.
    for label, data in (("parL1", self.__parL1), ("parL2", self.__parL2),
                        ("align", self.__align), ("pureL1", self.__pureL1),
                        ("pureL2", self.__pureL2)):
      sys.stderr.write(label + ":" + str(len(data)) + "\n")
  
  def __genTrainData(self):
    """Write one training file per (total size, pure/CS split, CS variant).

    Each file mixes `pr` pure (monolingual) sentences from each language
    with `tr` generated code-switched sentences; the filename encodes the
    split percentages and the total size.  Pauses on raw_input when the
    parallel data cannot satisfy the CS quota.
    """
    statusCount = 0
    for data in self.__dataRange:
      #control = 0
      #while 1:
      for Split in self.__splits:
      #for control in range(3):
        #if control == 3:
        #  break
        #pr= int(control*1.0/2 * data)
        # Split is (pure%, CS%): pr pure vs tr CS sentences out of `data`.
        pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
        tr = data - pr
        # Halved because pure lines are drawn separately from L1 and L2.
        pr = pr/2

        print pr
        random.seed()
        pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
        pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)

        for csType in self.__csVariants:
          # Fresh dedup set per output file; keys are (index, switch-seq).
          self.__csHash = set()
          ##sys.stderr.write("csType:"+str(csType)+'\n')
          # Debugging !!
          #switch = ""
          #############
          #for tag in self.__tagsetVariants:
            # Debugging !!
            #if switch == "yes":
            #    break
            ###################
            #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")

          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
          ##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w')
          ##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w')
          ##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w')

          # Pure (monolingual) sentences: language-tag, serialize, write.
          for index in pIndicesL1:
            line = self.__pureL1[index]
            #sys.stderr.write("L1 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')

            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''

          for index in pIndicesL2:
            line = self.__pureL2[index]
            #sys.stderr.write("L2 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')

            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''

          # tr+1 because the loop breaks BEFORE writing once the counter
          # reaches 0, so exactly tr CS sentences land in the file.
          stopLength = tr+1
          index = -1
          # Cycles over the parallel corpus (wraps at the end) until the
          # quota is met.  NOTE(review): if the corpus can never yield tr
          # distinct CS renderings, this loop does not terminate.
          while 1:
            index += 1
            if index == len(self.__parL1):
              index = 0
            csLine = ""
            # order alternates 0/1 with the countdown parity; its meaning is
            # defined by CSHandler.updateHandler (presumably which language
            # leads) -- confirm there.
            order = stopLength%2
            #sys.stderr.write("order:"+str(order)+'\n')
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            # Debugging !!                         
            #sys.stderr.write("Switch to another CS variant?? ")
            #switch = raw_input()
            #if switch == "yes":
            #    break
            ###############
            csLine = csReturn[0]
            #csSequence = csReturn[1]
            #print csReturn[1]
            # Dedup on sentence index plus the exact switch-point sequence.
            hashKey = (index, tuple(csReturn[1]))
            #print hashKey
            if csLine != -1 and hashKey not in self.__csHash:
              self.__csHash.add(hashKey)
              stopLength -= 1
            else:
              continue
            #sys.stderr.write("csLine:"+str(csLine)+'\n')
            #csLine = self.__addLangTags(csLine)

            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLine)))+'\n')
            csLineUni = self.__map2Uni(csLine)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLineUni)))+'\n')'''

          dataFile.close()
          ##dataFileUni.close()
          ##dataFileUniq.close()
          ##dataFileUniUniq.close()
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            #pr -= 1
            dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
        #pr += 1
    print statusCount
    
  def __genTrainDataDiverse(self):
    """Like __genTrainData, but sweeps a per-variant size range
    (self.__dataRanges[csType]) and gives up after ONE pass over the
    parallel corpus instead of cycling until the CS quota is met.
    """
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        for Split in self.__splits:
          # Split is (pure%, CS%): pr pure vs tr CS sentences out of `data`.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          # Halved: pure lines are drawn separately from L1 and L2.
          pr = pr/2

          print pr
          random.seed()

          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)

          ##for csType in self.__csVariants:
          self.__csHash = set()

          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')

          #### Dangerous ####

          ##pIndicesL1 = []
          ##pIndicesL2 = []

          #### End of Dangerous ####

          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')

          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')

          # tr+1: the loop breaks before writing the final accepted line,
          # so at most tr CS sentences are written.
          stopLength = tr+1
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              break
              # NOTE(review): unreachable after the break above -- this
              # variant stops after one pass (unlike collectData, which
              # wraps around); reorder these lines if cycling was intended.
              index = 0
            csLine = ""
            order = stopLength%2
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            hashKey = (index, tuple(csReturn[1]))
            # NOTE(review): the set is filled with bare indices but probed
            # with (index, seq) tuples, so `hashKey not in self.__csHash` is
            # always true and no deduplication actually happens here.
            if csLine != -1 and hashKey not in self.__csHash:
              #self.__csHash.add(hashKey)
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue

            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')

          dataFile.close()
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def __genTrainDataDup(self):
    """Duplicate-style generation: for each parallel pair that yields a CS
    sentence in BOTH orders, write both CS renderings to the training file
    and both monolingual originals (re-tagged via the dependency tree) to a
    parallel "_Control" file, on top of the sampled pure sentences.
    """
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:

        for splitIndex in range(len(self.__splits)):
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2

          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          # Pure sentences go into BOTH buffers so the control file mirrors
          # the training file line for line.
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          ##for csType in self.__csVariants:
          self.__csHash = set()
          stopLength = tr
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              break
              # NOTE(review): unreachable after the break above -- this
              # variant stops after one pass; collectData wraps instead.
              index = 0
            csLines = []
            # Try both generation orders; the pair is only used when both
            # orders produce a CS sentence.
            for order in range(2):
            #order = stopLength%2
              self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
              csReturn = self.__csInstance.csSentence(csType)
              csLine = csReturn[0]
              if csLine != -1:
                csLines.append(csLine)
            if len(csLines) == 2:
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n')
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n')
              ##if splitIndex == 0:
              # Control counterparts: the original L1 and L2 sentences,
              # re-tagged through the dependency tree.
              self.__Tree.updateTree(self.__parL1[index])
              pureLine = self.__Tree.wordTags()
              pureLine = self.__addLangTags(pureLine, self.__LID[0])
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__Tree.updateTree(self.__parL2[index])
              pureLine = self.__Tree.wordTags()
              #print pureLine
              pureLine = self.__addLangTags(pureLine, self.__LID[1])
              #print pureLine
              #sys.exit(0)
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue

            if stopLength <= 0:
              break

          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()

          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def collectData(self):
    """Collect and return the split-0 generation set for CS type 4.

    Returns a list mixing serialized pure lines (strings) with
    (parL1, parL2, align) tuples for every parallel pair whose two CS
    renderings cover exactly the same word set as the two originals.
    Also writes the corresponding training/control files as a side effect.
    NOTE(review): the early `return` on splitIndex > 0 means only the
    first split is ever processed, so the `if splitIndex != 0:` branch
    below is unreachable.
    """
    statusCount = 0
    for csType in [4]:
      for data in self.__dataRanges[csType]:
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          if splitIndex > 0:
            return initialSlitCSData
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2

          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            # NOTE(review): (pureLine) is a parenthesised string, NOT a
            # 1-tuple -- callers filtering on len(d) will see the string's
            # length.  Use (pureLine,) if a tuple was intended.
            initialSlitCSData.append((pureLine))
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            initialSlitCSData.append((pureLine))

          # NOTE(review): unreachable -- splitIndex > 0 returns above.
          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                # Wrap around and keep cycling until the quota is met.
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              # Keep the pair only if BOTH orders yield a CS sentence.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                # Accept only if the two CS renderings jointly cover the
                # exact word set of the two originals.
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    initialSlitCSData.append((self.__parL1[index],self.__parL2[index], self.__align[index]))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue

              if stopLength <= 0:
                break

            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()

          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
  def __genFromSingleData(self):
    """Regenerate every CS variant from the single dataset returned by
    collectData(), writing one file per (variant, split) plus a shared
    Baseline file of pure sentences (written only once).
    NOTE(review): collectData() mixes plain strings and 3-tuples, so the
    len(d)==1 filter below almost never matches a pure line and PUREData
    will usually be empty -- verify the intended element shape.
    """
    dataset = self.collectData()
    CSData = [d for d in dataset if len(d)==3]
    PUREData = [d[1] for d in dataset if len(d)==1]
    pureFile = open(self.__outputDir+"Baseline"+self.__fileSuffix,'w')
    pureFlag = 1
    for csType in self.__csVariants:
      for splitIndex in range(len(self.__splits)):
        Split = self.__splits[splitIndex]
        dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(len(dataset))+self.__fileSuffix,'w')
        stopLength = len(CSData)/2
        csData = []
        pureData = []
        index = -1
        while 1:
          index += 1
          # NOTE(review): wraps on len(__parL1) although `index` subscripts
          # CSData below -- if the lengths differ this can raise IndexError
          # or skip part of CSData.
          if index == len(self.__parL1):
            ##break
            index = 0
            print "Still:",stopLength," Looping.."
          csLines = []
          csSeqs = []
          # Keep the entry only if BOTH orders yield a CS sentence.
          for order in range(2):
          #order = stopLength%2
            self.__csInstance.updateHandler(CSData[index][0], CSData[index][1], CSData[index][2], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            if csLine != -1:
              csLines.append(csLine)
              csSeqs.append(csReturn[1])
          if len(csLines) == 2:
            # Accept only if the CS renderings cover exactly the words of
            # the two originals (re-tagged via the dependency tree).
            csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
            self.__Tree.updateTree(self.__parL1[index])
            pureLine1 = self.__Tree.wordTags()
            pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
            self.__Tree.updateTree(self.__parL2[index])
            pureLine2 = self.__Tree.wordTags()
            pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
            pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
            if pureWords == csWords:
              p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
              p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
              pureData.append(p1)
              pureData.append(p2)
              cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
              csData.append(cs1)
              cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
              csData.append(cs2)
              self.__csHash.add(index)
              stopLength -= 1
          else:
            continue

          if stopLength <= 0:
            break

        if stopLength > 0:
          # NOTE(review): `tr` is not defined in this method -- this print
          # raises NameError if the shortfall branch is ever taken.
          print tr, stopLength, "Training Break!!"
          dummy = raw_input()

        for csLine in csData:
          dataFile.write(csLine)
        if pureFlag:
          pureFlag = 0
          for pureLine in pureData:
            pureFile.write(pureLine)
          for pureLine in PUREData:
            pureFile.write(pureLine)
          pureFile.close()
        dataFile.close()
  
  def __genTrainDataDupStrict(self):
    """Generate training files for every CS variant / data size / split mix.

    For each code-switch variant (csType) and target size (data), emits one
    pair of files per pure/CS split ratio: a main training file (mixed pure +
    code-switched sentences) and a "_Control" file holding the pure
    counterparts of the same sentences.  "DupStrict" behavior: the
    code-switched sentences are generated only for the first split
    (splitIndex 0) and then re-sampled from that pool for every later split,
    so all splits share the same underlying CS material.
    """
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        # CS sentences produced for splitIndex 0, reused by later splits.
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          csData = []
          # Split is presumably a (pure, cs) ratio pair — TODO confirm
          # against where self.__splits is populated.
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          # pr = number of pure sentence pairs, tr = number of CS pairs;
          # together they account for `data` items per language side.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2

          print pr
          random.seed()
          # Sample pure monolingual sentences from each language; they go
          # into BOTH the training file and the control file.
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            # Serialized token format: word_#tag_#lang, space-separated.
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          if splitIndex != 0:
            # Later splits: re-sample CS material generated for split 0
            # instead of generating fresh sentences.
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              # sample = (cs order 0, cs order 1, pure L1, pure L2).
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            # First split: generate tr fresh CS sentence pairs, walking the
            # parallel corpus and wrapping around until enough are produced.
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                # Exhausted the corpus without reaching tr — wrap around
                # and keep trying (progress message for the operator).
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              # Try both language orders; keep the pair only if BOTH yield
              # a valid CS sentence (csSentence returns -1 on failure).
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                # Sanity check: the union of words in the two CS sentences
                # must equal the union of words in the two pure sentences.
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    # Bank the CS/pure quadruple for reuse by later splits.
                    initialSlitCSData.append((cs1,cs2, p1, p2))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue

              if stopLength <= 0:
                break

            if stopLength > 0:
              # Should be unreachable given the wraparound above; pause for
              # the operator if generation somehow fell short.
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()

          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          # Lightweight progress indicator.
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
    
  def __addLangTags(self, WordTags, lTag):
    """Return a copy of WordTags with lTag appended to every entry.

    WordTags -- sequence of word/tag entries (each itself a sequence,
                e.g. [word, POS]).
    lTag     -- language id string appended as the last element of each
                entry.
    Each entry is shallow-copied into a fresh list first, so the caller's
    input is never mutated.  Replaces the original copy-then-append loops
    with a single comprehension.
    """
    return [list(wt) + [lTag] for wt in WordTags]
  
  def __genPosMap(self):
    """Load the language->universal POS mappings and the per-language tag sets.

    Reads each map file ONCE (the original read each file twice and never
    closed the handles): every line is "srcTag uniTag"; srcTag -> uniTag goes
    into self.__posMap (L2 entries override L1 on collision, preserving the
    original read order) and srcTag is recorded in the language's tag set.
    Also fixes the original asymmetry where __L1Tags was reset but __L2Tags
    was not.  Finishes by computing the tags common to both languages.
    """
    self.__L1Tags = set()
    self.__L2Tags = set()
    for mapFile, tagSet in ((self.__l1MapFile, self.__L1Tags),
                            (self.__l2MapFile, self.__L2Tags)):
      mapHandle = open(mapFile)
      try:
        for line in mapHandle:
          fields = line.strip().split()
          srcTag = fields[0]
          uniTag = fields[1]
          self.__posMap[srcTag] = uniTag
          tagSet.add(srcTag)
      finally:
        mapHandle.close()
    # Intersection replaces the original manual list-comprehension version.
    self.__commonTags = self.__L1Tags & self.__L2Tags
  
  def __map2Uni(self, wordTagsLangs):
    """Map each entry's POS tag (index 1) to its universal tag.

    Tags missing from self.__posMap fall back to 'X'.  Returns a new list of
    copied entries: the original implementation aliased the input entries
    (mutating the caller's data in place) and used a bare ``except:``; this
    version copies each entry — matching the copy convention used by
    __addLangTags — and uses dict.get for the fallback.
    """
    mapped = []
    for entry in wordTagsLangs:
      newEntry = list(entry)
      newEntry[1] = self.__posMap.get(newEntry[1], 'X')
      mapped.append(newEntry)
    return mapped
  
  def __map2UniControl(self, wordTagsLangs):
    """Map each entry's POS tag (index 1) to "uniTag_lang" (lang = index 2).

    Tags missing from self.__posMap fall back to 'X'.  Returns a new list of
    copied entries: the original implementation aliased the input entries
    (mutating the caller's data in place) and used a bare ``except:``; this
    version copies each entry and uses dict.get for the fallback.
    """
    mapped = []
    for entry in wordTagsLangs:
      newEntry = list(entry)
      tag = newEntry[1]
      lang = newEntry[2]
      newEntry[1] = self.__posMap.get(tag, 'X') + '_' + lang
      mapped.append(newEntry)
    return mapped
  
  def __genPhraseMap(self):
    """Populate self.__phraseMap from the on-disk phrase mapping file.

    Each line is "key v1,v2,..."; the comma-separated values are extended
    onto self.__phraseMap[key] (a defaultdict(list)).  The file handle is
    now closed explicitly — the original leaked it.
    NOTE(review): path is hard-coded to a FrenchEnglish experiment
    directory — confirm it is the intended corpus for this class.
    """
    phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping")
    try:
      for line in phraseMapFile:
        fields = line.strip().split()
        self.__phraseMap[fields[0]].extend(fields[1].split(","))
    finally:
      phraseMapFile.close()
    
  def generateData(self):
    """Public entry point: generate all training-data variants.

    Delegates to __genTrainDataDupStrict.  Dead commented-out code (an old
    loop over numbered file suffixes) has been removed; behavior is
    unchanged.
    """
    self.__genTrainDataDupStrict()