Example #1
0
 def __init__(self, outDir):
   """Set up the FR/EN code-switching data generator.

   Stores the tagset-mapping file paths, creates empty corpus containers,
   fixes the generation variants, then runs the map-building pre-processing
   and hands the phrase map to the CS handler.
   """
   sys.stderr.write("DataGenerator: Constructor\n")
   # Language pair; index 0 is treated as L1 throughout.
   self.__LID = ["FR","EN"]
   # Universal-tagset mapping files for each side.
   self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/en-ptb.map"
   self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/fr-paris.map"
   # Parallel corpus containers and derived lookup tables.
   self.__L1, self.__L2, self.__align = [], [], []
   self.__outputDir = outDir
   self.__posMap = {}
   self.__phraseMap = dd(list)
   self.__csInstance = CSHandler()
   self.__utils = Utils()
   # Generation variants: CS strategies, tagset flavours, data sizes.
   self.__csVariants = [0, 1, 2]
   self.__tagsetVariants = ["", ".uni"]
   self.__dataRange = range(52, 1000, 52)
   # Tag inventories used for language identification.
   self.__L1Tags, self.__L2Tags, self.__commonTags = set(), set(), set()
   # Pre-processing: build the maps, then share the phrase map.
   self.__genPosMap()
   self.__genPhraseMap()
   self.__csInstance.updatePhraseMap(self.__phraseMap)
Example #2
0
 def __init__(self, outDir):
   """Create the generator: wire up the helper objects, remember the output
   directory, and build the default generation configuration last."""
   self.config = None        # populated by prepareConfig() below
   self.__csHash = set()     # keys of CS sentences already emitted
   self.__outputDir = outDir
   # Collaborators (construction order kept; constructors may log).
   self.__csInstance = CSHandler()
   self.__dataHandler = DataHandler()
   self.__utils = Utils()
   self.__Tree = Dependencytree()
   self.__fileSuffix = ""    # optional per-run suffix appended to file names
   self.prepareConfig()
Example #3
0
 def __init__(self, outDir):
   """Set up the HI/EN code-switching data generator.

   Builds paths, containers and generation variants, runs the map-building
   pre-processing, then applies "real test" overrides that replace some of
   the defaults declared above them (kept as an experimental record).
   """
   sys.stderr.write("DataGenerator: Constructor\n")
   ## Languages and Order; index 0 is treated as L1 throughout.
   self.__LID = ["HI","EN"]
   # Universal-tagset mapping files for each side.
   self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
   self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
   ## Data containers: parallel corpus, alignments, and monolingual pools.
   self.__parL1 = []
   self.__parL2 = []
   self.__align = []
   self.__pureL1 = []
   self.__pureL2 = []
   self.__outputDir = outDir
   self.__posMap = {}
   self.__phraseMap = dd(list)
   self.__csInstance = CSHandler()
   self.__utils = Utils()
   self.__Tree = Dependencytree()
   
   ## Generation Variants: CS strategies, tagset flavours, data sizes.
   self.__csVariants = [0,1,2,3,4]
   self.__tagsetVariants = ["",".uni"]
   self.__dataRange = range(50,900,50)
   ##self.__dataRange = [200]
   # (pure%, CS%) mixing ratios for each generated training set.
   self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]
   self.__csHash = set()  # keys of CS sentences already emitted
   ##LID stuff: tag inventories used for language identification.
   self.__L1Tags = set()
   self.__L2Tags = set()
   self.__commonTags = set()
   ## Pre processing: build maps, then configure the CS handler.
   self.__genPosMap()
   self.__genPhraseMap()
   self.__csInstance.updatePhraseMap(self.__phraseMap)
   self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
   
   ## Real test overwrites -- these intentionally replace the defaults set
   ## above for the current experiment; commented lines are earlier setups.
   #self.__csVariants = [1,2,3,4]
   self.__tagsetVariants = [""]
   self.__dataRange = [400]
   # Per-CS-type data sizes (overrides __dataRange where consulted).
   self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}
   #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]}
   #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]}
   #self.__splits = [(50,50)]
   #for i in range(0,51,5):
   #  split = (100-i, i)
   #  self.__splits.append(split)
   self.__fileSuffix = ""
Example #4
0
class Generator:
  def __init__(self, outDir):
    self.config = None
    self.__csHash = set()
    self.__outputDir = outDir
    self.__csInstance = CSHandler()
    self.__dataHandler = DataHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    self.__fileSuffix = ""
    self.prepareConfig()
    
  def prepareConfig(self):
    self.config = GeneratingConfig()
    self.config.setCSVariants([0, 1, 2, 3, 4])
    self.config.setDataRanges({0:range(50, 1001, 50), 1:range(50, 1001, 50), 2:range(50, 1001, 50), 3:range(50, 1001, 50), 4:range(50, 1001, 50)})
    self.config.setSplits([(50, 50), (60, 40), (70, 30), (80, 20), (90, 10)])
    self.config.setTagsetVariants([".uniq", ".uni"])
  
  def prepareGenerator(self):
    self.__csInstance.updateLIDTags(self.__dataHandler.LID[0], self.__dataHandler.LID[1])
  
  def prepareRealTest(self, dataFile, outFile):
    dataFile = open(dataFile)
    outFile = open(outFile, 'w')
    for line in dataFile:
      line = map(lambda x:x.split('_#'), line.strip().split())
      uniLine = self.__dataHandler.mapLD2Uni(line)
      outFile.write(' '.join(map(lambda x:'_#'.join(x), uniLine)) + '\n')
    outFile.close()

  def generateTestData(self):
    self.config.setDataRanges({0:range(30, 151, 50), 1:range(30, 151, 50), 2:range(30, 151, 50), 3:range(30, 151, 50), 4:range(30, 151, 50)})
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Testing Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()
  
  
  def generateDataForTest(self):
    for i in range(10):
      self.__fileSuffix = "."+str(i)
      self.generateTrainDataForTest()
  
  def generateTrainDataForTest(self):
    self.config.setDataRanges({0:[450], 1:[450], 2:[450], 3:[450], 4:[450]})
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType),
      for data in self.config.dataRanges[csType]:
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          if splitIndex == len(self.config.splits) - 1:
            print
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.. ",
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  
  def generateTrainData(self):
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def generateUCTrainData(self): # Unknown words constrained training data
    statusCount = 0
    for csType in self.config.csVariants:
      for data in self.config.dataRanges[csType]:
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print pr
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                if pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  def makeString(self, wordsTagsLangs):
    return ' '.join(map(lambda x:"_#".join(x), wordsTagsLangs)) + '\n'
    
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    self.__dataHandler.loadData(l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data)
Example #5
0
class DataGenerator:
  def __init__(self, outDir):
    sys.stderr.write("DataGenerator: Constructor\n")
    ## Languages and Order
    self.__LID = ["FR","EN"]
    self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/en-ptb.map"
    self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/fr-paris.map"
    ## Data containers
    self.__L1 = []
    self.__L2 = []
    self.__align = []
    self.__outputDir = outDir
    self.__posMap = {}
    self.__phraseMap = dd(list)
    self.__csInstance = CSHandler()
    self.__utils = Utils()
    ## Generation Variants
    self.__csVariants = [0,1,2]
    self.__tagsetVariants = ["",".uni"]
    self.__dataRange = range(52,1000,52)
    ##LID stuff
    self.__L1Tags = set()
    self.__L2Tags = set()
    self.__commonTags = set()
    ## Pre processing
    self.__genPosMap()
    self.__genPhraseMap()
    self.__csInstance.updatePhraseMap(self.__phraseMap)
 
  def loadData(self, l1Data, l2Data, aligns):
    self.__L1 = [l.strip() for l in open(l1Data)]
    self.__L2 = [l.strip() for l in open(l2Data)]
    self.__align = [l.strip() for l in open(aligns)]
  
  def __genTestData(self, testIndices):
    for csType in self.__csVariants:
      #for tag in self.__tagsetVariants:
      dataFile = open(self.__outputDir+"TestCSType"+str(csType),'w')
      dataFileUni = open(self.__outputDir+"TestCSType"+str(csType)+".uni",'w')
      stopLength = 5129
      for index in testIndices:
        order = stopLength%2
        self.__csInstance.updateHandler(self.__L1[index], self.__L2[index], self.__align[index], order)
        csReturn = self.__csInstance.csSentence(csType)
        csLine = csReturn[0]
        #csSequence = csReturn[1]
        if csLine != -1:
          stopLength -= 1
        else:
          continue
        self.__addLangTags(csLine)
        csLineUni = self.__map2Uni(csLine)
        dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
        if stopLength == 0:
          break
      dataFile.close()
      dataFileUni.close()
      if stopLength != 0:
        print "Test Break!!", 5129, stopLength
        dummy = raw_input()
  
  def __genTrainData(self, pureIndices, csIndices):
    statusCount = 0
    for data in self.__dataRange:
      pr = 0
      while 1:
      #for pr in range(3):
        if pr == 3:
          break
        pr= int(pr*1.0/2 * data)
        tr = data - pr
        pr = pr/2
        random.seed()
        pIndices = random.sample(pureIndices, pr)
        cIndices = random.sample(csIndices, tr*5)
        for csType in self.__csVariants:
          print csType
          # Debugging !!
          #switch = ""
          #############
          #for tag in self.__tagsetVariants:
            # Debugging !!
            #if switch == "yes":
            #    break
            ###################
            #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(tr)+"Pure"+str(pr*2),'w')
          dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(tr)+"Pure"+str(pr*2)+".uni",'w')
          for index in pIndices:
            l1Line = self.__utils.wordTags(self.__L1[index])
            l2Line = self.__utils.wordTags(self.__L2[index])
            self.__addLangTags(l1Line)
            self.__addLangTags(l2Line)
            l1LineUni = self.__map2Uni(l1Line)
            l2LineUni = self.__map2Uni(l2Line)
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), l1Line))+'\n')
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), l2Line))+'\n')
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), l1LineUni))+'\n')
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), l2LineUni))+'\n')
          stopLength = tr
          for index in cIndices:
            csLine = ""
            order = stopLength%2
            #print order
            self.__csInstance.updateHandler(self.__L1[index], self.__L2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            # Debugging !!                         
            #sys.stderr.write("Switch to another CS variant?? ")
            #switch = raw_input()
            #if switch == "yes":
            #    break
            ###############
            csLine = csReturn[0]
            #csSequence = csReturn[1]
            if csLine != -1:
              stopLength -= 1
            else:
              continue
            self.__addLangTags(csLine)
            csLineUni = self.__map2Uni(csLine)
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
            if stopLength == 0:
              break
          dataFile.close()
          dataFileUni.close()
          if stopLength != 0:
            print tr, stopLength, "Training Break!!"
            pr -= 1
            #dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
        pr += 1
    print statusCount
    
  def __addLangTags(self, wordTags):
    #print self.__L1Tags
    #print self.__L2Tags
    #print wordTags
    for index in range(len(wordTags)):
      tag = wordTags[index][1]
      lang = ""
      if tag in self.__commonTags:
        lang = "C"
      elif tag in self.__L1Tags:
        lang = self.__LID[0]
      elif tag in self.__L2Tags:
        lang = self.__LID[1]
      if lang == "":
        print "Something wrong with the tagsets in the function add_lang"
        dummy = raw_input()
      wordTags[index].append(lang)
  
  def __genPosMap(self):
    for i in open(self.__l1MapFile):
      i = i.strip()
      srcTag = [i.split()[0]]
      uniTag = i.split()[1]
      if srcTag[0].find('|') >= 0:
        srcTag = srcTag[0].split('|')
      for tag in srcTag:
        self.__posMap[tag] = uniTag
    for i in open(self.__l2MapFile):
      i = i.strip()
      srcTag = [i.split()[0]]
      uniTag = i.split()[1]
      if srcTag[0].find('|') >= 0:
        srcTag = srcTag[0].split('|')
      for tag in srcTag:
        self.__posMap[tag] = uniTag
        
    
    self.__L1Tags = set()
    for line in open(self.__l1MapFile):
      tags = line.split()[0].split('|')
      for tag in tags:
        self.__L1Tags.add(tag)
    for line in open(self.__l2MapFile):
      tags = line.split()[0].split('|')
      for tag in tags:
        self.__L2Tags.add(tag)
    self.__commonTags = set([c for c in self.__L1Tags if c in self.__L2Tags])
  
  def __map2Uni(self, wordTagsLangs):
    newLine = []
    for index in range(len(wordTagsLangs)):
      newLine.append(wordTagsLangs[index])
      tag = wordTagsLangs[index][1]
      try:
        newLine[index][1] = self.__posMap[tag]
      except:
        dummy = raw_input("Something wrong.. Couldn't find Uni Map\n")
    return newLine
  
  def __genPhraseMap(self):
    phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping")
    for i in phraseMapFile:
      i = i.strip()
      self.__phraseMap[i.split()[0]].extend(i.split()[1].split(","))
    
  def __randomSample(self):
    print "Random Sample Train"
    totalLines = 95129
    testLines = 15129
    testIndices = random.sample(range(totalLines),testLines)
    #print testIndices
    trainIndices = []
    for i in range(totalLines):
      if i not in testIndices:
        trainIndices.append(i)
    csIndices = random.sample(trainIndices, 6000)
    remaining = []
    for i in trainIndices:
      if i not in csIndices:
        remaining.append(i)
    pureIndices = random.sample(remaining,6000)
    return trainIndices, testIndices, pureIndices, csIndices

  def __getRanges(self, dataRanges):
    dataRanges = open(dataRanges)
    trainIndices = []
    testIndices = []
    pureIndices = []
    csIndices = []
    for i in dataRanges:
        if i.split(":")[0] == "trainIndices":
            indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
            for j in  indices:
                j = int(j)
                trainIndices.append(j)
        elif i.split(":")[0] == "testIndices":
            indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
            for j in  indices:
                j = int(j)
                testIndices.append(j)
        elif i.split(":")[0] == "pureIndices":
            indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
            for j in  indices:
                j = int(j)
                pureIndices.append(j)
        elif i.split(":")[0] == "csIndices":
            indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
            for j in  indices:
                j = int(j)
                csIndices.append(j)
    #print len(testIndices), len(trainIndices), len(pureIndices), len(csIndices)
    #************************************
    # Un-comment this when sampling again
    #************************************
    '''rangesFile = open(dataRanges,'w')
    rangesFile.write("trainIndices:"+str(trainIndices)+"\n")
    rangesFile.write("testIndices:"+str(testIndices)+"\n")
    rangesFile.write("pureIndices:"+str(pureIndices)+"\n")
    rangesFile.write("csIndices:"+str(csIndices)+"\n")
    rangesFile.close()
    return self.__randomSample()'''
    return trainIndices, testIndices, pureIndices, csIndices
    
  def generateData(self, dataRanges):    
    ranges = self.__getRanges(dataRanges)
    #unmappedEnglish = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/unmapped")
    #unmappedEnPhrases = [i.strip() for i in unmappedEnglish]
    #trainIndices = ranges[0]
    testIndices = ranges[1]
    pureIndices = ranges[2]
    csIndices = ranges[3]
    self.__genTrainData(pureIndices, csIndices)
    self.__genTestData(testIndices)
Example #6
0
class DataGenerator:
  def __init__(self, outDir):
    """Set up a Hindi-English code-switch data generator writing to outDir."""
    sys.stderr.write("DataGenerator: Constructor\n")
    # Language identifiers: index 0 = L1 = Hindi, index 1 = L2 = English.
    self.__LID = ["HI","EN"]
    # Universal-tagset mapping files for the two languages.
    self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
    self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
    # Parallel sentences, alignments and monolingual ("pure") corpora.
    self.__parL1 = []
    self.__parL2 = []
    self.__align = []
    self.__pureL1 = []
    self.__pureL2 = []
    self.__outputDir = outDir
    # Tag/phrase maps and helper objects.
    self.__posMap = {}
    self.__phraseMap = dd(list)
    self.__csInstance = CSHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    # Generation variants: CS strategies, tagsets, sizes, pure/CS splits,
    # plus the dedup set used while generating.
    self.__csVariants = [0,1,2,3,4]
    self.__tagsetVariants = ["",".uni"]
    self.__dataRange = range(50,900,50)
    self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]
    self.__csHash = set()
    # Tag inventories used for language-id tagging.
    self.__L1Tags = set()
    self.__L2Tags = set()
    self.__commonTags = set()
    # Pre-processing: build the maps and hand them to the CS handler.
    self.__genPosMap()
    self.__genPhraseMap()
    self.__csInstance.updatePhraseMap(self.__phraseMap)
    self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
    # "Real test" overrides: single tagset, one total size, and a
    # per-CS-variant size range.
    self.__tagsetVariants = [""]
    self.__dataRange = [400]
    self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40),
                         2:range(35,540,35), 3:range(30,451,30),
                         4:range(15,231,15)}
    self.__fileSuffix = ""
 
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    """Read parallel sentences, alignments and monolingual corpora into memory."""
    self.__parL1 = self.__utils.readSentences(l1Data)
    self.__parL2 = self.__utils.readSentences(l2Data)
    self.__align = self.__utils.readAligns(l1Aligns, l2Aligns)
    self.__pureL1 = self.__utils.readSentencesPlain(pureL1Data)
    self.__pureL2 = self.__utils.readSentencesPlain(pureL2Data)
    # Report the size of every container we just loaded.
    for label, data in (("parL1", self.__parL1), ("parL2", self.__parL2),
                        ("align", self.__align), ("pureL1", self.__pureL1),
                        ("pureL2", self.__pureL2)):
      sys.stderr.write(label + ":" + str(len(data)) + "\n")
  
  def __genTrainData(self):
    """Write one training file per (total size, pure/CS split, CS variant).

    Each file mixes `pr` pure (monolingual) sentences from each language
    with `tr` generated code-switched sentences; the filename encodes the
    split percentages and the total size.  Pauses on raw_input when the
    parallel data cannot satisfy the CS quota.
    """
    statusCount = 0
    for data in self.__dataRange:
      #control = 0
      #while 1:
      for Split in self.__splits:
      #for control in range(3):
        #if control == 3:
        #  break
        #pr= int(control*1.0/2 * data)
        # Split is (pure%, CS%): pr pure vs tr CS sentences out of `data`.
        pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
        tr = data - pr
        # Halved because pure lines are drawn separately from L1 and L2.
        pr = pr/2

        print pr
        random.seed()
        pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
        pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)

        for csType in self.__csVariants:
          # Fresh dedup set per output file; keys are (index, switch-seq).
          self.__csHash = set()
          ##sys.stderr.write("csType:"+str(csType)+'\n')
          # Debugging !!
          #switch = ""
          #############
          #for tag in self.__tagsetVariants:
            # Debugging !!
            #if switch == "yes":
            #    break
            ###################
            #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")

          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
          ##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w')
          ##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w')
          ##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w')

          # Pure (monolingual) sentences: language-tag, serialize, write.
          for index in pIndicesL1:
            line = self.__pureL1[index]
            #sys.stderr.write("L1 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')

            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''

          for index in pIndicesL2:
            line = self.__pureL2[index]
            #sys.stderr.write("L2 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')

            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''

          # tr+1 because the loop breaks BEFORE writing once the counter
          # reaches 0, so exactly tr CS sentences land in the file.
          stopLength = tr+1
          index = -1
          # Cycles over the parallel corpus (wraps at the end) until the
          # quota is met.  NOTE(review): if the corpus can never yield tr
          # distinct CS renderings, this loop does not terminate.
          while 1:
            index += 1
            if index == len(self.__parL1):
              index = 0
            csLine = ""
            # order alternates 0/1 with the countdown parity; its meaning is
            # defined by CSHandler.updateHandler (presumably which language
            # leads) -- confirm there.
            order = stopLength%2
            #sys.stderr.write("order:"+str(order)+'\n')
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            # Debugging !!                         
            #sys.stderr.write("Switch to another CS variant?? ")
            #switch = raw_input()
            #if switch == "yes":
            #    break
            ###############
            csLine = csReturn[0]
            #csSequence = csReturn[1]
            #print csReturn[1]
            # Dedup on sentence index plus the exact switch-point sequence.
            hashKey = (index, tuple(csReturn[1]))
            #print hashKey
            if csLine != -1 and hashKey not in self.__csHash:
              self.__csHash.add(hashKey)
              stopLength -= 1
            else:
              continue
            #sys.stderr.write("csLine:"+str(csLine)+'\n')
            #csLine = self.__addLangTags(csLine)

            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLine)))+'\n')
            csLineUni = self.__map2Uni(csLine)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLineUni)))+'\n')'''

          dataFile.close()
          ##dataFileUni.close()
          ##dataFileUniq.close()
          ##dataFileUniUniq.close()
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            #pr -= 1
            dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
        #pr += 1
    print statusCount
    
  def __genTrainDataDiverse(self):
    """Like __genTrainData, but sweeps a per-variant size range
    (self.__dataRanges[csType]) and gives up after ONE pass over the
    parallel corpus instead of cycling until the CS quota is met.
    """
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        for Split in self.__splits:
          # Split is (pure%, CS%): pr pure vs tr CS sentences out of `data`.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          # Halved: pure lines are drawn separately from L1 and L2.
          pr = pr/2

          print pr
          random.seed()

          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)

          ##for csType in self.__csVariants:
          self.__csHash = set()

          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')

          #### Dangerous ####

          ##pIndicesL1 = []
          ##pIndicesL2 = []

          #### End of Dangerous ####

          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')

          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')

          # tr+1: the loop breaks before writing the final accepted line,
          # so at most tr CS sentences are written.
          stopLength = tr+1
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              break
              # NOTE(review): unreachable after the break above -- this
              # variant stops after one pass (unlike collectData, which
              # wraps around); reorder these lines if cycling was intended.
              index = 0
            csLine = ""
            order = stopLength%2
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            hashKey = (index, tuple(csReturn[1]))
            # NOTE(review): the set is filled with bare indices but probed
            # with (index, seq) tuples, so `hashKey not in self.__csHash` is
            # always true and no deduplication actually happens here.
            if csLine != -1 and hashKey not in self.__csHash:
              #self.__csHash.add(hashKey)
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue

            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')

          dataFile.close()
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def __genTrainDataDup(self):
    """Duplicate-style generation: for each parallel pair that yields a CS
    sentence in BOTH orders, write both CS renderings to the training file
    and both monolingual originals (re-tagged via the dependency tree) to a
    parallel "_Control" file, on top of the sampled pure sentences.
    """
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:

        for splitIndex in range(len(self.__splits)):
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2

          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          # Pure sentences go into BOTH buffers so the control file mirrors
          # the training file line for line.
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          ##for csType in self.__csVariants:
          self.__csHash = set()
          stopLength = tr
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              break
              # NOTE(review): unreachable after the break above -- this
              # variant stops after one pass; collectData wraps instead.
              index = 0
            csLines = []
            # Try both generation orders; the pair is only used when both
            # orders produce a CS sentence.
            for order in range(2):
            #order = stopLength%2
              self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
              csReturn = self.__csInstance.csSentence(csType)
              csLine = csReturn[0]
              if csLine != -1:
                csLines.append(csLine)
            if len(csLines) == 2:
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n')
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n')
              ##if splitIndex == 0:
              # Control counterparts: the original L1 and L2 sentences,
              # re-tagged through the dependency tree.
              self.__Tree.updateTree(self.__parL1[index])
              pureLine = self.__Tree.wordTags()
              pureLine = self.__addLangTags(pureLine, self.__LID[0])
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__Tree.updateTree(self.__parL2[index])
              pureLine = self.__Tree.wordTags()
              #print pureLine
              pureLine = self.__addLangTags(pureLine, self.__LID[1])
              #print pureLine
              #sys.exit(0)
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue

            if stopLength <= 0:
              break

          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()

          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def collectData(self):
    """Collect and return the split-0 generation set for CS type 4.

    Returns a list mixing serialized pure lines (strings) with
    (parL1, parL2, align) tuples for every parallel pair whose two CS
    renderings cover exactly the same word set as the two originals.
    Also writes the corresponding training/control files as a side effect.
    NOTE(review): the early `return` on splitIndex > 0 means only the
    first split is ever processed, so the `if splitIndex != 0:` branch
    below is unreachable.
    """
    statusCount = 0
    for csType in [4]:
      for data in self.__dataRanges[csType]:
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          if splitIndex > 0:
            return initialSlitCSData
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2

          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            # NOTE(review): (pureLine) is a parenthesised string, NOT a
            # 1-tuple -- callers filtering on len(d) will see the string's
            # length.  Use (pureLine,) if a tuple was intended.
            initialSlitCSData.append((pureLine))
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            initialSlitCSData.append((pureLine))

          # NOTE(review): unreachable -- splitIndex > 0 returns above.
          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                # Wrap around and keep cycling until the quota is met.
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              # Keep the pair only if BOTH orders yield a CS sentence.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                # Accept only if the two CS renderings jointly cover the
                # exact word set of the two originals.
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    initialSlitCSData.append((self.__parL1[index],self.__parL2[index], self.__align[index]))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue

              if stopLength <= 0:
                break

            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()

          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
  def __genFromSingleData(self):
    """Regenerate every CS variant from the single dataset returned by
    collectData(), writing one file per (variant, split) plus a shared
    Baseline file of pure sentences (written only once).
    NOTE(review): collectData() mixes plain strings and 3-tuples, so the
    len(d)==1 filter below almost never matches a pure line and PUREData
    will usually be empty -- verify the intended element shape.
    """
    dataset = self.collectData()
    CSData = [d for d in dataset if len(d)==3]
    PUREData = [d[1] for d in dataset if len(d)==1]
    pureFile = open(self.__outputDir+"Baseline"+self.__fileSuffix,'w')
    pureFlag = 1
    for csType in self.__csVariants:
      for splitIndex in range(len(self.__splits)):
        Split = self.__splits[splitIndex]
        dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(len(dataset))+self.__fileSuffix,'w')
        stopLength = len(CSData)/2
        csData = []
        pureData = []
        index = -1
        while 1:
          index += 1
          # NOTE(review): wraps on len(__parL1) although `index` subscripts
          # CSData below -- if the lengths differ this can raise IndexError
          # or skip part of CSData.
          if index == len(self.__parL1):
            ##break
            index = 0
            print "Still:",stopLength," Looping.."
          csLines = []
          csSeqs = []
          # Keep the entry only if BOTH orders yield a CS sentence.
          for order in range(2):
          #order = stopLength%2
            self.__csInstance.updateHandler(CSData[index][0], CSData[index][1], CSData[index][2], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            if csLine != -1:
              csLines.append(csLine)
              csSeqs.append(csReturn[1])
          if len(csLines) == 2:
            # Accept only if the CS renderings cover exactly the words of
            # the two originals (re-tagged via the dependency tree).
            csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
            self.__Tree.updateTree(self.__parL1[index])
            pureLine1 = self.__Tree.wordTags()
            pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
            self.__Tree.updateTree(self.__parL2[index])
            pureLine2 = self.__Tree.wordTags()
            pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
            pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
            if pureWords == csWords:
              p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
              p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
              pureData.append(p1)
              pureData.append(p2)
              cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
              csData.append(cs1)
              cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
              csData.append(cs2)
              self.__csHash.add(index)
              stopLength -= 1
          else:
            continue

          if stopLength <= 0:
            break

        if stopLength > 0:
          # NOTE(review): `tr` is not defined in this method -- this print
          # raises NameError if the shortfall branch is ever taken.
          print tr, stopLength, "Training Break!!"
          dummy = raw_input()

        for csLine in csData:
          dataFile.write(csLine)
        if pureFlag:
          pureFlag = 0
          for pureLine in pureData:
            pureFile.write(pureLine)
          for pureLine in PUREData:
            pureFile.write(pureLine)
          pureFile.close()
        dataFile.close()
  
  def __genTrainDataDupStrict(self):
    """Generate training files for every CS variant / data size / split mix.

    For each code-switch variant (csType) and target size (data), emits one
    pair of files per pure/CS split ratio: a main training file (mixed pure +
    code-switched sentences) and a "_Control" file holding the pure
    counterparts of the same sentences.  "DupStrict" behavior: the
    code-switched sentences are generated only for the first split
    (splitIndex 0) and then re-sampled from that pool for every later split,
    so all splits share the same underlying CS material.
    """
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        # CS sentences produced for splitIndex 0, reused by later splits.
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          csData = []
          # Split is presumably a (pure, cs) ratio pair — TODO confirm
          # against where self.__splits is populated.
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          # pr = number of pure sentence pairs, tr = number of CS pairs;
          # together they account for `data` items per language side.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2

          print pr
          random.seed()
          # Sample pure monolingual sentences from each language; they go
          # into BOTH the training file and the control file.
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            # Serialized token format: word_#tag_#lang, space-separated.
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          if splitIndex != 0:
            # Later splits: re-sample CS material generated for split 0
            # instead of generating fresh sentences.
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              # sample = (cs order 0, cs order 1, pure L1, pure L2).
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            # First split: generate tr fresh CS sentence pairs, walking the
            # parallel corpus and wrapping around until enough are produced.
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                # Exhausted the corpus without reaching tr — wrap around
                # and keep trying (progress message for the operator).
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              # Try both language orders; keep the pair only if BOTH yield
              # a valid CS sentence (csSentence returns -1 on failure).
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                # Sanity check: the union of words in the two CS sentences
                # must equal the union of words in the two pure sentences.
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    # Bank the CS/pure quadruple for reuse by later splits.
                    initialSlitCSData.append((cs1,cs2, p1, p2))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue

              if stopLength <= 0:
                break

            if stopLength > 0:
              # Should be unreachable given the wraparound above; pause for
              # the operator if generation somehow fell short.
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()

          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          # Lightweight progress indicator.
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
    
  def __addLangTags(self, WordTags, lTag):
    """Return a copy of WordTags with lTag appended to every entry.

    WordTags -- sequence of word/tag entries (each itself a sequence,
                e.g. [word, POS]).
    lTag     -- language id string appended as the last element of each
                entry.
    Each entry is shallow-copied into a fresh list first, so the caller's
    input is never mutated.  Replaces the original copy-then-append loops
    with a single comprehension.
    """
    return [list(wt) + [lTag] for wt in WordTags]
  
  def __genPosMap(self):
    """Load the language->universal POS mappings and the per-language tag sets.

    Reads each map file ONCE (the original read each file twice and never
    closed the handles): every line is "srcTag uniTag"; srcTag -> uniTag goes
    into self.__posMap (L2 entries override L1 on collision, preserving the
    original read order) and srcTag is recorded in the language's tag set.
    Also fixes the original asymmetry where __L1Tags was reset but __L2Tags
    was not.  Finishes by computing the tags common to both languages.
    """
    self.__L1Tags = set()
    self.__L2Tags = set()
    for mapFile, tagSet in ((self.__l1MapFile, self.__L1Tags),
                            (self.__l2MapFile, self.__L2Tags)):
      mapHandle = open(mapFile)
      try:
        for line in mapHandle:
          fields = line.strip().split()
          srcTag = fields[0]
          uniTag = fields[1]
          self.__posMap[srcTag] = uniTag
          tagSet.add(srcTag)
      finally:
        mapHandle.close()
    # Intersection replaces the original manual list-comprehension version.
    self.__commonTags = self.__L1Tags & self.__L2Tags
  
  def __map2Uni(self, wordTagsLangs):
    """Map each entry's POS tag (index 1) to its universal tag.

    Tags missing from self.__posMap fall back to 'X'.  Returns a new list of
    copied entries: the original implementation aliased the input entries
    (mutating the caller's data in place) and used a bare ``except:``; this
    version copies each entry — matching the copy convention used by
    __addLangTags — and uses dict.get for the fallback.
    """
    mapped = []
    for entry in wordTagsLangs:
      newEntry = list(entry)
      newEntry[1] = self.__posMap.get(newEntry[1], 'X')
      mapped.append(newEntry)
    return mapped
  
  def __map2UniControl(self, wordTagsLangs):
    """Map each entry's POS tag (index 1) to "uniTag_lang" (lang = index 2).

    Tags missing from self.__posMap fall back to 'X'.  Returns a new list of
    copied entries: the original implementation aliased the input entries
    (mutating the caller's data in place) and used a bare ``except:``; this
    version copies each entry and uses dict.get for the fallback.
    """
    mapped = []
    for entry in wordTagsLangs:
      newEntry = list(entry)
      tag = newEntry[1]
      lang = newEntry[2]
      newEntry[1] = self.__posMap.get(tag, 'X') + '_' + lang
      mapped.append(newEntry)
    return mapped
  
  def __genPhraseMap(self):
    """Populate self.__phraseMap from the on-disk phrase mapping file.

    Each line is "key v1,v2,..."; the comma-separated values are extended
    onto self.__phraseMap[key] (a defaultdict(list)).  The file handle is
    now closed explicitly — the original leaked it.
    NOTE(review): path is hard-coded to a FrenchEnglish experiment
    directory — confirm it is the intended corpus for this class.
    """
    phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping")
    try:
      for line in phraseMapFile:
        fields = line.strip().split()
        self.__phraseMap[fields[0]].extend(fields[1].split(","))
    finally:
      phraseMapFile.close()
    
  def generateData(self):
    """Public entry point: generate all training-data variants.

    Delegates to __genTrainDataDupStrict.  Dead commented-out code (an old
    loop over numbered file suffixes) has been removed; behavior is
    unchanged.
    """
    self.__genTrainDataDupStrict()