def __init__(self):
    # NOTE(review): this def is a byte-for-byte duplicate of
    # CSHandler.__init__ further down in this file — it sits at top level
    # with no enclosing class and looks like an extraction artifact.
    #self.__L1Tree = Parsetree()
    #self.__L2Tree = Parsetree()
    self.__L1 = ""      # LID tag of language 1; filled in by updateLIDTags
    self.__L2 = ""      # LID tag of language 2; filled in by updateLIDTags
    self.__curL1 = ""   # LID tag currently playing the L1 role (depends on order)
    self.__curL2 = ""   # LID tag currently playing the L2 role
    self.__L1Tree = Dependencytree()   # parse of the sentence currently acting as L1
    self.__L2Tree = Dependencytree()   # parse of the sentence currently acting as L2
    self.__align = {}      # L1 word index -> list of aligned L2 word indices
    self.__revAlign = {}   # L2 word index -> list of aligned L1 word indices
    self.__utils = Utils()
    self.__phraseMap = {}  # L1 phrase tag -> acceptable equivalent L2 phrase tags
    self.__l1Index = 0     # which member of the parallel pair is treated as L1 (0 or 1)
    # Chunk tags treated as clausal heads by the Hindi-clausal CS variant.
    self.__clausalChunks = ["CCP","VGF", "NULL__CCP","NULL__VGF"]
def __init__(self, outDir):
    # NOTE(review): this def is a duplicate of Generator.__init__ further
    # down in this file — top-level, no enclosing class; extraction artifact.
    self.config = None            # GeneratingConfig instance; built by prepareConfig()
    self.__csHash = set()         # keys of CS sentences already emitted (duplicate guard)
    self.__outputDir = outDir     # directory all generated files are written into
    self.__csInstance = CSHandler()
    self.__dataHandler = DataHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()   # scratch tree reused when re-reading parses
    self.__fileSuffix = ""           # optional suffix (e.g. ".3") appended to file names
    self.prepareConfig()
def __init__(self, outDir): sys.stderr.write("DataGenerator: Constructor\n") ## Languages and Order self.__LID = ["HI","EN"] self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map" self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map" ## Data containers self.__parL1 = [] self.__parL2 = [] self.__align = [] self.__pureL1 = [] self.__pureL2 = [] self.__outputDir = outDir self.__posMap = {} self.__phraseMap = dd(list) self.__csInstance = CSHandler() self.__utils = Utils() self.__Tree = Dependencytree() ## Generation Variants self.__csVariants = [0,1,2,3,4] self.__tagsetVariants = ["",".uni"] self.__dataRange = range(50,900,50) ##self.__dataRange = [200] self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)] self.__csHash = set() ##LID stuff self.__L1Tags = set() self.__L2Tags = set() self.__commonTags = set() ## Pre processing self.__genPosMap() self.__genPhraseMap() self.__csInstance.updatePhraseMap(self.__phraseMap) self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1]) ## Real test overwrites #self.__csVariants = [1,2,3,4] self.__tagsetVariants = [""] self.__dataRange = [400] self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)} #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]} #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]} #self.__splits = [(50,50)] #for i in range(0,51,5): # split = (100-i, i) # self.__splits.append(split) self.__fileSuffix = ""
class Generator:
    """Generate code-switched (CS) and pure-language POS-tagged data files.

    Combines a CSHandler (builds CS sentences from parallel dependency
    parses), a DataHandler (corpus access, LID tagging, universal-tagset
    mapping) and a GeneratingConfig (which CS variants, data sizes and
    pure/CS splits to emit).  For every (csType, size, split) combination
    four files are written under the output directory: CS training data,
    a pure-only "_Control" counterpart, and ".uni" universal-tagset
    versions of both.

    NOTE(review): generateTestData, generateTrainDataForTest and
    generateTrainData are near-identical copies differing only in data
    ranges, progress printing and the break message — strong candidates
    for consolidation into one parameterized method.
    """

    def __init__(self, outDir):
        self.config = None            # GeneratingConfig; built by prepareConfig()
        self.__csHash = set()         # keys of CS sentences already emitted (duplicate guard)
        self.__outputDir = outDir     # directory all generated files are written into
        self.__csInstance = CSHandler()
        self.__dataHandler = DataHandler()
        self.__utils = Utils()
        self.__Tree = Dependencytree()   # scratch tree reused when re-reading parses
        self.__fileSuffix = ""           # optional suffix (e.g. ".3") appended to file names
        self.prepareConfig()

    def prepareConfig(self):
        """Install the default generation configuration (all 5 CS variants)."""
        self.config = GeneratingConfig()
        self.config.setCSVariants([0, 1, 2, 3, 4])
        self.config.setDataRanges({0:range(50, 1001, 50), 1:range(50, 1001, 50), 2:range(50, 1001, 50), 3:range(50, 1001, 50), 4:range(50, 1001, 50)})
        self.config.setSplits([(50, 50), (60, 40), (70, 30), (80, 20), (90, 10)])
        self.config.setTagsetVariants([".uniq", ".uni"])

    def prepareGenerator(self):
        """Propagate the two LID tags from the data handler to the CS handler."""
        self.__csInstance.updateLIDTags(self.__dataHandler.LID[0], self.__dataHandler.LID[1])

    def prepareRealTest(self, dataFile, outFile):
        """Rewrite a word_#tag file into the universal tagset.

        dataFile/outFile are path strings; the parameters are rebound to the
        open file objects.
        NOTE(review): the input handle is never closed, and neither file uses
        a context manager.
        """
        dataFile = open(dataFile)
        outFile = open(outFile, 'w')
        for line in dataFile:
            # Each token is "word_#tag"; split into [word, tag] pairs.
            line = map(lambda x:x.split('_#'), line.strip().split())
            uniLine = self.__dataHandler.mapLD2Uni(line)
            outFile.write(' '.join(map(lambda x:'_#'.join(x), uniLine)) + '\n')
        outFile.close()

    def generateTestData(self):
        """Emit test-sized data files for every CS variant / size / split."""
        # Overwrite the configured ranges with small test sizes.
        self.config.setDataRanges({0:range(30, 151, 50), 1:range(30, 151, 50), 2:range(30, 151, 50), 3:range(30, 151, 50), 4:range(30, 151, 50)})
        for csType in self.config.csVariants:
            print "type" + str(csType)
            for data in self.config.dataRanges[csType]:
                print
                print " numSents:" + str(data * 2),
                # CS pairs built for split 0 are re-sampled for later splits.
                initialSplitCSData = []
                for splitIndex in range(len(self.config.splits)):
                    csData = []
                    Split = self.config.splits[splitIndex]   # (pure%, CS%)
                    pureData = []
                    pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
                    dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
                    pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
                    dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
                    pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)   # pure sentences per language
                    tr = data - pr                                             # CS sentence pairs to build
                    print " Pure:" + str(2 * pr),
                    print " CS:" + str(2 * tr),
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
                    pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
                    # Pure sentences go into both the control and the CS file.
                    for index in pIndicesL1:
                        line = self.__dataHandler.pureL1[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    for index in pIndicesL2:
                        line = self.__dataHandler.pureL2[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    if splitIndex != 0:
                        # Re-sample CS pairs from the pool built during split 0.
                        random.seed()
                        csSample = random.sample(initialSplitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        self.__csHash = set()
                        stopLength = tr   # CS pairs still needed
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__dataHandler.parL1):
                                ##break
                                # Exhausted the parallel corpus: wrap around and retry
                                # (CS generation is randomized, so retries can succeed).
                                index = 0
                                print "Still:", stopLength, " Looping.."
                            csLines = []
                            csSeqs = []
                            hashKeys = ["", ""]
                            # Build one CS sentence per language order (L1-first, L2-first).
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    hashKeys[order] = (index, order, tuple(csReturn[1]))
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:   # both orders succeeded
                                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                                # NOTE(review): the leading "True or" short-circuits the
                                # vocabulary-match + duplicate filter (debug leftover?);
                                # compare generateUCTrainData where the filter is active.
                                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                                    pureData.append(tuple(pureLine1))
                                    pureData.append(tuple(pureLine2))
                                    csData.append(tuple(csLines[0]))
                                    csData.append(tuple(csLines[1]))
                                    if splitIndex == 0:
                                        initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                                    stopLength -= 1
                                    for hashKey in hashKeys:
                                        self.__csHash.add(hashKey)
                                else:
                                    continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            # Should be unreachable while the wrap-around retry is on.
                            print tr, stopLength, "Testing Break!!"
                            dummy = raw_input()   # pause so the shortfall is noticed
                    for csLine in csData:
                        dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
                        dataFile.write(self.makeString(csLine))
                    for pureLine in pureData:
                        pureFile.write(self.makeString(pureLine))
                        pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
                    pureFile.close()
                    dataFile.close()
                    pureUniFile.close()
                    dataUniFile.close()

    def generateDataForTest(self):
        """Run generateTrainDataForTest 10 times, one file-suffix per run."""
        for i in range(10):
            self.__fileSuffix = "."+str(i)
            self.generateTrainDataForTest()

    def generateTrainDataForTest(self):
        """Same pipeline as generateTestData but with fixed size 450 per type.

        See the comments in generateTestData inside this class; only the data
        ranges, progress printing and the "Training Break!!" message differ.
        """
        self.config.setDataRanges({0:[450], 1:[450], 2:[450], 3:[450], 4:[450]})
        statusCount = 0
        for csType in self.config.csVariants:
            print "type" + str(csType),
            for data in self.config.dataRanges[csType]:
                print " numSents:" + str(data * 2),
                initialSplitCSData = []
                for splitIndex in range(len(self.config.splits)):
                    csData = []
                    Split = self.config.splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
                    dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
                    pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
                    dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
                    pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
                    tr = data - pr
                    print " Pure:" + str(2 * pr),
                    print " CS:" + str(2 * tr),
                    if splitIndex == len(self.config.splits) - 1:
                        print   # terminate the progress line after the last split
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
                    pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
                    for index in pIndicesL1:
                        line = self.__dataHandler.pureL1[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    for index in pIndicesL2:
                        line = self.__dataHandler.pureL2[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    if splitIndex != 0:
                        random.seed()
                        csSample = random.sample(initialSplitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__dataHandler.parL1):
                                ##break
                                index = 0
                                print "Still:", stopLength, " Looping.. ",
                            csLines = []
                            csSeqs = []
                            hashKeys = ["", ""]
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    hashKeys[order] = (index, order, tuple(csReturn[1]))
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                                # NOTE(review): "True or" disables this filter (see
                                # the same note in generateTestData above).
                                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                                    pureData.append(tuple(pureLine1))
                                    pureData.append(tuple(pureLine2))
                                    csData.append(tuple(csLines[0]))
                                    csData.append(tuple(csLines[1]))
                                    if splitIndex == 0:
                                        initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                                    stopLength -= 1
                                    for hashKey in hashKeys:
                                        self.__csHash.add(hashKey)
                                else:
                                    continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Training Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
                        dataFile.write(self.makeString(csLine))
                    for pureLine in pureData:
                        pureFile.write(self.makeString(pureLine))
                        pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
                    pureFile.close()
                    dataFile.close()
                    pureUniFile.close()
                    dataUniFile.close()
                    statusCount += 1
                    if statusCount % 50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def generateTrainData(self):
        """Same pipeline as generateTestData but using the configured ranges.

        See the comments in generateTestData inside this class; only progress
        printing and the break message differ.
        """
        statusCount = 0
        for csType in self.config.csVariants:
            print "type" + str(csType)
            for data in self.config.dataRanges[csType]:
                print
                print " numSents:" + str(data * 2),
                initialSplitCSData = []
                for splitIndex in range(len(self.config.splits)):
                    csData = []
                    Split = self.config.splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
                    dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
                    pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
                    dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
                    pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
                    tr = data - pr
                    print " Pure:" + str(2 * pr),
                    print " CS:" + str(2 * tr),
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
                    pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
                    for index in pIndicesL1:
                        line = self.__dataHandler.pureL1[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    for index in pIndicesL2:
                        line = self.__dataHandler.pureL2[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    if splitIndex != 0:
                        random.seed()
                        csSample = random.sample(initialSplitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__dataHandler.parL1):
                                ##break
                                index = 0
                                print "Still:", stopLength, " Looping.."
                            csLines = []
                            csSeqs = []
                            hashKeys = ["", ""]
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    hashKeys[order] = (index, order, tuple(csReturn[1]))
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                                # NOTE(review): "True or" disables this filter (see
                                # the same note in generateTestData above).
                                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                                    pureData.append(tuple(pureLine1))
                                    pureData.append(tuple(pureLine2))
                                    csData.append(tuple(csLines[0]))
                                    csData.append(tuple(csLines[1]))
                                    if splitIndex == 0:
                                        initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                                    stopLength -= 1
                                    for hashKey in hashKeys:
                                        self.__csHash.add(hashKey)
                                else:
                                    continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Training Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
                        dataFile.write(self.makeString(csLine))
                    for pureLine in pureData:
                        pureFile.write(self.makeString(pureLine))
                        pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
                    pureFile.close()
                    dataFile.close()
                    pureUniFile.close()
                    dataUniFile.close()
                    statusCount += 1
                    if statusCount % 50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def generateUCTrainData(self):
        # Unknown words constrained training data
        """Like generateTrainData, but the vocabulary/duplicate filter is ON.

        Here a CS pair is only accepted when the CS word set equals the pure
        word set (so no unknown words are introduced) and its hash keys have
        not been emitted before — this is the filter the other generate*
        methods disable with "True or".
        """
        statusCount = 0
        for csType in self.config.csVariants:
            for data in self.config.dataRanges[csType]:
                initialSplitCSData = []
                for splitIndex in range(len(self.config.splits)):
                    csData = []
                    Split = self.config.splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
                    dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
                    pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
                    dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
                    pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
                    tr = data - pr
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
                    pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
                    for index in pIndicesL1:
                        line = self.__dataHandler.pureL1[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    for index in pIndicesL2:
                        line = self.__dataHandler.pureL2[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    if splitIndex != 0:
                        random.seed()
                        csSample = random.sample(initialSplitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__dataHandler.parL1):
                                ##break
                                index = 0
                                print "Still:", stopLength, " Looping.."
                            csLines = []
                            csSeqs = []
                            hashKeys = ["", ""]
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    hashKeys[order] = (index, order, tuple(csReturn[1]))
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                                # Filter active: same vocabulary and not emitted before.
                                if pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                                    pureData.append(tuple(pureLine1))
                                    pureData.append(tuple(pureLine2))
                                    csData.append(tuple(csLines[0]))
                                    csData.append(tuple(csLines[1]))
                                    if splitIndex == 0:
                                        initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                                    stopLength -= 1
                                    for hashKey in hashKeys:
                                        self.__csHash.add(hashKey)
                                else:
                                    continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Training Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
                        dataFile.write(self.makeString(csLine))
                    for pureLine in pureData:
                        pureFile.write(self.makeString(pureLine))
                        pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
                    pureFile.close()
                    dataFile.close()
                    pureUniFile.close()
                    dataUniFile.close()
                    statusCount += 1
                    if statusCount % 50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def makeString(self, wordsTagsLangs):
        """Join [word, tag, lang] triples into one 'word_#tag_#lang ...' line."""
        return ' '.join(map(lambda x:"_#".join(x), wordsTagsLangs)) + '\n'

    def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
        """Delegate corpus loading (parallel, alignment, pure files) to the DataHandler."""
        self.__dataHandler.loadData(l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data)
class CSHandler:
    """Builds a single code-switched sentence from a parallel sentence pair.

    Given two dependency-parsed sentences and their word alignment, selects
    an L1 word-index sequence according to one of five CS strategies
    (csType 0-4) and lets Utils splice in the aligned L2 material.
    """

    def __init__(self):
        #self.__L1Tree = Parsetree()
        #self.__L2Tree = Parsetree()
        self.__L1 = ""      # LID tag of language 1; filled in by updateLIDTags
        self.__L2 = ""      # LID tag of language 2; filled in by updateLIDTags
        self.__curL1 = ""   # LID tag currently playing the L1 role (depends on order)
        self.__curL2 = ""   # LID tag currently playing the L2 role
        self.__L1Tree = Dependencytree()   # parse of the sentence currently acting as L1
        self.__L2Tree = Dependencytree()   # parse of the sentence currently acting as L2
        self.__align = {}      # L1 word index -> list of aligned L2 word indices
        self.__revAlign = {}   # L2 word index -> list of aligned L1 word indices
        self.__utils = Utils()
        self.__phraseMap = {}  # L1 phrase tag -> acceptable equivalent L2 phrase tags
        self.__l1Index = 0     # which member of the parallel pair is treated as L1 (0 or 1)
        # Chunk tags treated as clausal heads by the Hindi-clausal CS variant.
        self.__clausalChunks = ["CCP","VGF", "NULL__CCP","NULL__VGF"]

    def updatePhraseMap(self, phraseMap):
        """Install the L1->L2 phrase-tag equivalence map."""
        self.__phraseMap = phraseMap

    def updateLIDTags(self, L1, L2):
        """Set the LID tags used to label words of each language."""
        self.__L1 = L1
        self.__L2 = L2

    def updateHandler(self, l1Sent, l2Sent, alignLine, l1Index):
        """Load a parallel pair; l1Index picks which sentence acts as L1.

        l1Index == 0: l1Sent is L1; l1Index == 1: roles are swapped and the
        alignment is read in the reverse direction.
        """
        #sys.stderr.write(alignLine+'\n')
        l2Index = 1-l1Index
        self.__l1Index = l1Index
        if l1Index:
            # Swapped order: the second sentence plays the L1 role.
            self.__curL1 = self.__L2
            self.__curL2 = self.__L1
            self.__L1Tree.updateTree(l2Sent)
            self.__L2Tree.updateTree(l1Sent)
        else:
            self.__curL1 = self.__L1
            self.__curL2 = self.__L2
            self.__L1Tree.updateTree(l1Sent)
            self.__L2Tree.updateTree(l2Sent)
        self.__align = self.__parseAlign(alignLine, l1Index, l2Index)
        self.__revAlign = self.__parseAlign(alignLine, l2Index, l1Index)
        # NOTE(review): this rebinds __csHash (a set in __init__) to a
        # defaultdict of sets, as updateBadSwitch expects.
        self.__csHash = dd(lambda:set())

    def updateBadSwitch(self, index, l1Switch, l2Switch):
        """Record a rejected switch pair for the given sentence index."""
        self.__csHash[index].add(l1Switch)
        self.__csHash[index].add(l2Switch)

    def __parseAlign(self, alignLine, l1Index, l2Index):
        """Parse 'i-j' alignment tokens into {l1 index: [l2 indices]}."""
        align = {}
        for i in alignLine.split():
            key = int(i.split("-")[l1Index])
            value = int(i.split("-")[l2Index])
            if key in align.keys():
                align[key].append(value)
            else:
                align[key] = [value]
        return align

    def csSentence(self, csType):
        """Build one CS sentence for the loaded pair.

        Returns [csSentence, sequence], or [-1, []] when no valid L1
        sequence could be selected for this csType.
        """
        validSequences = self.__utils.validSequences(self.__L1Tree.sentLen())
        # Debugging !!
        #sys.stderr.write("L1SeqL2Cont Valid Sequences:\n")
        #for sequence in validSequences:
        #    sys.stderr.write(str(sequence)+"\n")
        #dummy=raw_input()
        ###############
        # Assumption that a sentence will have a single code switch.
        sequence = self.__selectSequence(validSequences, csType)
        # Debugging !!
        ##sys.stderr.write("Selected Sequence: "+str(sequence)+"\n")
        ##sys.stderr.write("Align:"+str(self.__align)+'\n')
        #sys.stderr.write(l1Parse)
        #sys.stderr.write(l2Parse)
        #dummy=raw_input()
        if sequence == -1:
            return [-1,[]]
        ##print "Sequence:", sequence
        csSentence = self.__utils.makeCSSentence(self.__L1Tree.wordTags(), sequence, self.__L2Tree.wordTags(), self.__align, self.__curL1, self.__curL2)
        return [csSentence,sequence]

    ## Assumptions:
    ## There is always a single code switch
    ## The selection among the valid candidate sequences is random
    def __selectSequence(self, validSequences, csType):
        """Dispatch to the sequence-selection strategy for csType (0-4).

        NOTE(review): an unknown csType falls through and implicitly
        returns None, which callers do not distinguish from -1.
        """
        if csType == 0:
            return self.__random(validSequences)
        elif csType == 1:
            return self.__seqL1SeqL2Cont(validSequences)
        elif csType == 2:
            return self.__seqL1Const()
        elif csType == 4:
            return self.__seqHindiClausal()
        elif csType == 3:
            return self.__seqL1ConstL2Const()

    def __checkHindiClause(self, sequence, OBJ):
        """True when the sequence's head qualifies as a clausal attachment.

        OBJ is a Dependencytree; head is (chunkTag, headIndex) looked up by
        the frozen index set.
        """
        if frozenset(sequence) not in OBJ.heads:
            return False
        try:
            head = OBJ.heads[frozenset(sequence)]
            #print "Head:", head
            if not OBJ.isChunkSubtree(head[1], sequence) and head[0] in self.__clausalChunks and not OBJ.hasNPChild(head[1]):
                return True
        except:
            # NOTE(review): bare except silently masks any error in the
            # tree queries above.
            pass
        return False

    def __seqHindiClausal(self):
        """csType 4: pick a Hindi clausal subtree, whichever side Hindi is on.

        Returns the L1 index sequence, or -1 after 100 failed samples.
        """
        l1Sequence = -1
        if self.__l1Index:
            ## Hindi is L2
            subtrees = self.__L2Tree.subtrees
            if len(subtrees) == 0:
                return -1
            count = 0
            while 1:
                l2Sequence = list(random.sample(subtrees,1)[0])
                # Map the Hindi-side subtree back to L1 indices via the
                # reverse alignment.
                l1Sequence = self.__utils.l2Sequence(l2Sequence, self.__revAlign)
                if len(l1Sequence) == self.__L1Tree.sentLen():
                    count += 1
                    continue   # whole sentence would switch — not a CS
                if len(l2Sequence) == self.__L2Tree.sentLen():
                    count += 1
                    continue
                #if (frozenset(l2Sequence) in self.__L2Tree.heads and self.__L2Tree.heads[frozenset(l2Sequence)] in self.__clausalChunks and len(l1Sequence) > 0 )or count > 100:
                if len(l2Sequence)>0 and len(l1Sequence)>0 and self.__checkHindiClause(l2Sequence, self.__L2Tree):
                    #if len(l1Sequence) == 0:
                    #    print "l1Seq:",l1Sequence
                    #    print "subtrees:", subtrees
                    #    print "Count:", count
                    #    dummy = raw_input(
                    return l1Sequence
                count += 1
                if count > 100:
                    break
        else:
            ## Hin is L1
            subtrees = self.__L1Tree.subtrees
            if len(subtrees) == 0:
                return -1
            count = 0
            while 1:
                l1Sequence = list(random.sample(subtrees,1)[0])
                if len(l1Sequence) == self.__L1Tree.sentLen():
                    count += 1
                    continue
                l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
                if len(l2Sequence) == self.__L2Tree.sentLen():
                    count += 1
                    continue
                #if (frozenset(l1Sequence) in self.__L1Tree.heads and self.__L1Tree.heads[frozenset(l1Sequence)] in self.__clausalChunks and len(l2Sequence) > 0 )or count > 100:
                if len(l1Sequence)>0 and len(l2Sequence)>0 and self.__checkHindiClause(l1Sequence, self.__L1Tree):
                    return l1Sequence
                count += 1
                if count > 100:
                    break
        return -1

    def __random(self, sequences):
        """csType 0: random valid sequence with a non-empty L2 projection.

        NOTE(review): unlike __randomStrict, after 100 failures this returns
        the last (possibly unaligned) sample rather than -1.
        """
        random.seed()
        l1Sequence = -1
        count = 0
        while 1:
            l1Sequence = random.sample(sequences, 1)[0]
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if len(l2Sequence) > 0 or count > 100:
                break
            count += 1
        return l1Sequence

    def __randomStrict(self, sequences):
        """Like __random but returns -1 when no aligned sample is found."""
        random.seed()
        l1Sequence = -1
        count = 0
        while 1:
            l1Sequence = random.sample(sequences, 1)[0]
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if len(l2Sequence) > 0:
                return l1Sequence
            count += 1
            if count > 100:
                break
        return -1

    def __seqL1SeqL2Cont(self, sequences):
        """csType 1: L1 sequence whose L2 projection is contiguous."""
        random.seed()
        sequenceIndex = -1
        l1Sequence = -1
        count = 0
        while 1:
            count += 1
            if count%100 == 0:
                return -1
                # NOTE(review): unreachable — placed after the return.
                sys.stderr.write("L1SeqL2Cont"+"InfLoop:"+str(count)+"\n")
            sequenceIndex = random.randrange(len(sequences))
            l1Sequence = sequences[sequenceIndex]
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if self.__L2Tree.isContiguous(l2Sequence):
                # Debugging !!
                #sys.stderr.write("Alignment: "+str(align)+"\n")
                #sys.stderr.write("Contiguous l2Sequence: "+str(l2Sequence)+"\n")
                ##############
                break
        return l1Sequence

    def __seqL1Const(self):
        """csType 2: random L1 subtree (constituent) with some L2 alignment."""
        random.seed()
        subtrees = self.__L1Tree.subtrees
        l1Sequence = -1
        if len(subtrees)>0:
            count = 0
            while 1:
                l1Sequence = list(random.sample(subtrees,1)[0])
                if len(l1Sequence) == self.__L1Tree.sentLen():
                    count += 1
                    continue   # whole sentence — not a switch
                l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
                if len(l2Sequence) > 0:
                    l1Sequence.sort()
                    return l1Sequence
                count += 1
                if count > 100:
                    break
        return -1

    def __seqL1LWG(self):
        """Variant of __seqL1Const sampling local word groups instead of subtrees."""
        random.seed()
        LWGs = self.__L1Tree.LWGs()
        l1Sequence = -1
        if len(LWGs)>0:
            count = 0
            while 1:
                l1Sequence = list(random.sample(LWGs,1)[0])
                if len(l1Sequence) == self.__L1Tree.sentLen():
                    count += 1
                    continue
                l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
                if len(l2Sequence) > 0:
                    l1Sequence.sort()
                    return l1Sequence
                count += 1
                if count > 100:
                    break
        return -1

    def __seqL1ConstL2Const(self):
        """csType 3: L1 subtree whose L2 projection is itself a subtree."""
        random.seed()
        subtrees = self.__L1Tree.subtrees
        l1Sequence = -1
        if len(subtrees)>0:
            count = 0
            while 1:
                l1Sequence = list(random.sample(subtrees,1)[0])
                if len(l1Sequence) == self.__L1Tree.sentLen():
                    count += 1
                    continue
                l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
                if frozenset(l2Sequence) in self.__L2Tree.subtrees:
                    l1Sequence.sort()
                    return l1Sequence
                else:
                    ##print "L1 Sentence:",self.__L1Tree.word_tags()
                    ##print "L2 Sentence:",self.__L2Tree.word_tags()
                    ##print "L1 Sequence:",l1Sequence
                    ##print "L2 Sequence:", l2Sequence
                    ##dummy = raw_input()
                    dummy = 1   # placeholder keeping the else branch non-empty
                count += 1
                if count > 100:
                    break
        return -1

    def __seqL1LWGL2LWG(self):
        """LWG-to-LWG variant of __seqL1ConstL2Const."""
        random.seed()
        LWGs = self.__L1Tree.LWGs()
        l1Sequence = -1
        if len(LWGs)>0:
            count = 0
            while 1:
                l1Sequence = list(random.sample(LWGs,1)[0])
                if len(l1Sequence) == self.__L1Tree.sentLen():
                    count += 1
                    continue
                l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
                if frozenset(l2Sequence) in self.__L2Tree.LWGs():
                    l1Sequence.sort()
                    return l1Sequence
                else:
                    ##print "L1 Sentence:",self.__L1Tree.word_tags()
                    ##print "L2 Sentence:",self.__L2Tree.word_tags()
                    ##print "L1 Sequence:",l1Sequence
                    ##print "L2 Sequence:", l2Sequence
                    ##dummy = raw_input()
                    dummy = 1   # placeholder keeping the else branch non-empty
                count += 1
                if count > 100:
                    break
        return -1

    def __seqL1ConstL2Cont(self, sequences):
        """L1 constituent whose L2 projection is contiguous."""
        random.seed()
        sequenceIndex = -1
        l1Sequence = -1
        count = 0
        while 1:
            count += 1
            if count%100 == 0:
                return -1
                # NOTE(review): unreachable — placed after the return.
                sys.stderr.write("L1Const2Cont"+"InfLoop:"+str(count)+"\n")
            sequenceIndex = random.randrange(len(sequences))
            l1Sequence = sequences[sequenceIndex]
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if self.__L1Tree.isConstituent(l1Sequence):
                if self.__L2Tree.isContiguous(l2Sequence):
                    # Debugging !!
                    #sys.stderr.write("Alignment: "+str(align)+"\n")
                    #sys.stderr.write("Contiguous l2Sequence: "+str(l2Sequence)+"\n")
                    ##############
                    break
        return l1Sequence

    def __seqL1ConstL2SameConst(self, sequences):
        """L1 constituent whose L2 projection is a constituent of the same
        (or phrase-map-equivalent) phrase type — dual structure principle."""
        random.seed()
        sequenceIndex = -1
        l1Sequence = -1
        count = 0
        while 1:
            count += 1
            if count%500 == 0:
                return -1
                # NOTE(review): unreachable — placed after the return.
                sys.stderr.write("L1Const2SameConst"+"InfLoop:"+str(count)+"\n")
            sequenceIndex = random.randrange(len(sequences))
            l1Sequence = sequences[sequenceIndex]
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if self.__L1Tree.isConstituent(l1Sequence):
                if self.__L2Tree.isConstituent(l2Sequence):
                    l1PhraseTag = self.__L1Tree.getPhrase(l1Sequence[0])
                    l2PhraseTag = self.__L2Tree.getPhrase(l2Sequence[0])
                    ## Both the phrases are same, for dual structure principle
                    if l1PhraseTag == l2PhraseTag or l2PhraseTag in self.__phraseMap[l1PhraseTag]:
                        # Debugging !!
                        #sys.stderr.write("Alignment: "+str(align)+"\n")
                        #sys.stderr.write("L2Sequence: "+str(l2Sequence)+" Same Const: "+l1PhraseTag+"\n")
                        ##############
                        break
        return l1Sequence
class DataGenerator:
    """Generates code-switched (CS) + monolingual training data files for
    the Hinglish POS/LID experiments.

    Reads parallel HI/EN parses plus word alignments, asks CSHandler to
    synthesize code-switched sentences per CS-type variant, mixes them with
    pure monolingual sentences at several split ratios, and writes one
    training file per (csType, split, size) combination into outDir.

    Python 2 code; token format on disk is word_#tag_#lang per token.
    """

    def __init__(self, outDir):
        sys.stderr.write("DataGenerator: Constructor\n")
        ## Languages and Order
        self.__LID = ["HI","EN"]            # language-ID tags; [0]=L1, [1]=L2
        # Tag-to-universal-POS mapping files (hard-coded cluster paths).
        self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
        self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
        ## Data containers
        self.__parL1 = []                   # parsed L1 sentences (parallel)
        self.__parL2 = []                   # parsed L2 sentences (parallel)
        self.__align = []                   # word alignments, parallel to above
        self.__pureL1 = []                  # monolingual L1 word/tag lines
        self.__pureL2 = []                  # monolingual L2 word/tag lines
        self.__outputDir = outDir
        self.__posMap = {}                  # source tag -> universal tag
        self.__phraseMap = dd(list)         # L1 phrase label -> compatible L2 labels
        self.__csInstance = CSHandler()
        self.__utils = Utils()
        self.__Tree = Dependencytree()
        ## Generation Variants
        self.__csVariants = [0,1,2,3,4]     # code-switching strategies
        self.__tagsetVariants = ["",".uni"]
        self.__dataRange = range(50,900,50) # training-set sizes
        ##self.__dataRange = [200]
        self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]  # (pure%, CS%)
        self.__csHash = set()               # dedup of generated CS sentences
        ##LID stuff
        self.__L1Tags = set()
        self.__L2Tags = set()
        self.__commonTags = set()           # tags shared by both tagsets
        ## Pre processing
        self.__genPosMap()
        self.__genPhraseMap()
        self.__csInstance.updatePhraseMap(self.__phraseMap)
        self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
        ## Real test overwrites -- these deliberately clobber the defaults above.
        #self.__csVariants = [1,2,3,4]
        self.__tagsetVariants = [""]
        self.__dataRange = [400]
        # Per-csType size schedules (csType -> list of dataset sizes).
        self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}
        #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]}
        #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]}
        #self.__splits = [(50,50)]
        #for i in range(0,51,5):
        #    split = (100-i, i)
        #    self.__splits.append(split)
        self.__fileSuffix = ""              # appended to every output filename

    def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
        """Load parallel parses, alignments, and monolingual corpora from the
        given file paths, reporting the loaded counts on stderr."""
        self.__parL1 = self.__utils.readSentences(l1Data)
        self.__parL2 = self.__utils.readSentences(l2Data)
        self.__align = self.__utils.readAligns(l1Aligns, l2Aligns)
        self.__pureL1 = self.__utils.readSentencesPlain(pureL1Data)
        self.__pureL2 = self.__utils.readSentencesPlain(pureL2Data)
        sys.stderr.write("parL1:"+str(len(self.__parL1))+"\n")
        sys.stderr.write("parL2:"+str(len(self.__parL2))+"\n")
        sys.stderr.write("align:"+str(len(self.__align))+"\n")
        sys.stderr.write("pureL1:"+str(len(self.__pureL1))+"\n")
        sys.stderr.write("pureL2:"+str(len(self.__pureL2))+"\n")

    def __genTrainData(self):
        """Original generator: for every size/split/csType combination write
        one training file containing pr/2 pure-L1 + pr/2 pure-L2 sentences
        and tr code-switched sentences (deduplicated via __csHash)."""
        statusCount = 0
        for data in self.__dataRange:
            #control = 0
            #while 1:
            for Split in self.__splits:
                #for control in range(3):
                #if control == 3:
                #    break
                #pr= int(control*1.0/2 * data)
                pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)   # pure count
                tr = data - pr                                       # CS count
                pr = pr/2    # half pure from each language (Py2 int division)
                print pr
                random.seed()
                pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                for csType in self.__csVariants:
                    self.__csHash = set()   # dedup is per output file
                    ##sys.stderr.write("csType:"+str(csType)+'\n')
                    # Debugging !!
                    #switch = ""
                    #############
                    #for tag in self.__tagsetVariants:
                    # Debugging !!
                    #if switch == "yes":
                    #    break
                    ###################
                    #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
                    sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
                    ##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w')
                    ##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w')
                    ##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w')
                    # Pure L1 sentences, tagged with the L1 language ID.
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        #sys.stderr.write("L1 Line:"+str(line)+'\n')
                        line = self.__addLangTags(line, self.__LID[0])
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
                        ##Commented for real test
                        ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
                        lineUni = self.__map2Uni(line)
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
                        dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
                    # Pure L2 sentences, tagged with the L2 language ID.
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        #sys.stderr.write("L2 Line:"+str(line)+'\n')
                        line = self.__addLangTags(line, self.__LID[1])
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
                        ##Commented for real test
                        ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
                        lineUni = self.__map2Uni(line)
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
                        dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
                    # CS sentences: cycle through the parallel corpus until tr
                    # accepted.  stopLength starts at tr+1 because the loop
                    # breaks BEFORE writing the final accepted sentence.
                    stopLength = tr+1
                    index = -1
                    while 1:
                        index += 1
                        if index == len(self.__parL1):
                            index = 0   # wrap around the corpus
                        csLine = ""
                        order = stopLength%2   # alternate language order
                        #sys.stderr.write("order:"+str(order)+'\n')
                        self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                        csReturn = self.__csInstance.csSentence(csType)
                        # Debugging !!
                        #sys.stderr.write("Switch to another CS variant?? ")
                        #switch = raw_input()
                        #if switch == "yes":
                        #    break
                        ###############
                        csLine = csReturn[0]
                        #csSequence = csReturn[1]
                        #print csReturn[1]
                        # (sentence index, switched span) uniquely identifies a
                        # generated CS sentence.
                        hashKey = (index, tuple(csReturn[1]))
                        #print hashKey
                        if csLine != -1 and hashKey not in self.__csHash:
                            self.__csHash.add(hashKey)
                            stopLength -= 1
                        else:
                            continue
                        #sys.stderr.write("csLine:"+str(csLine)+'\n')
                        #csLine = self.__addLangTags(csLine)
                        if stopLength <= 0:
                            break
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
                        ##Commented for real test
                        ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLine)))+'\n')
                        csLineUni = self.__map2Uni(csLine)
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
                        dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLineUni)))+'\n')'''
                    dataFile.close()
                    ##dataFileUni.close()
                    ##dataFileUniq.close()
                    ##dataFileUniUniq.close()
                    if stopLength > 0:
                        # Could not generate enough CS data -- halt for a human.
                        print tr, stopLength, "Training Break!!"
                        #pr -= 1
                        dummy = raw_input()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
                #pr += 1
        print statusCount

    def __genTrainDataDiverse(self):
        """Variant of __genTrainData using per-csType size schedules
        (__dataRanges) and deduplicating by sentence index only, so each
        parallel sentence contributes at most one CS sentence."""
        statusCount = 0
        for csType in self.__csVariants:
            for data in self.__dataRanges[csType]:
                for Split in self.__splits:
                    pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    ##for csType in self.__csVariants:
                    self.__csHash = set()
                    sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
                    #### Dangerous ####
                    ##pIndicesL1 = []
                    ##pIndicesL2 = []
                    #### End of Dangerous ####
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
                    stopLength = tr+1
                    index = -1
                    while 1:
                        index += 1
                        if index == len(self.__parL1):
                            # Single pass over the corpus: stop when exhausted.
                            break
                            # NOTE(review): unreachable after break -- the wrap
                            # behaviour of __genTrainData was disabled here.
                            index = 0
                        csLine = ""
                        order = stopLength%2
                        self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                        csReturn = self.__csInstance.csSentence(csType)
                        csLine = csReturn[0]
                        hashKey = (index, tuple(csReturn[1]))
                        if csLine != -1 and hashKey not in self.__csHash:
                            #self.__csHash.add(hashKey)
                            self.__csHash.add(index)   # dedup per source sentence
                            stopLength -= 1
                        else:
                            continue
                        if stopLength <= 0:
                            break
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
                    dataFile.close()
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def __genTrainDataDup(self):
        """Paired variant: each source sentence must yield CS sentences in
        BOTH language orders; writes the CS file plus a "_Control" file with
        the corresponding pure sentences (2*data total per file)."""
        statusCount = 0
        for csType in self.__csVariants:
            for data in self.__dataRanges[csType]:
                for splitIndex in range(len(self.__splits)):
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    # Pure sentences go to both the control and the CS file.
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    ##for csType in self.__csVariants:
                    self.__csHash = set()
                    stopLength = tr
                    index = -1
                    while 1:
                        index += 1
                        if index == len(self.__parL1):
                            break
                            # NOTE(review): unreachable after break (see
                            # __genTrainDataDiverse for the same pattern).
                            index = 0
                        csLines = []
                        # Require a CS sentence in both orders (L1-first and
                        # L2-first) from the same parallel pair.
                        for order in range(2):
                            #order = stopLength%2
                            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                            csReturn = self.__csInstance.csSentence(csType)
                            csLine = csReturn[0]
                            if csLine != -1:
                                csLines.append(csLine)
                        if len(csLines) == 2:
                            csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n')
                            csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n')
                            ##if splitIndex == 0:
                            # Matching pure sentences for the control file.
                            self.__Tree.updateTree(self.__parL1[index])
                            pureLine = self.__Tree.wordTags()
                            pureLine = self.__addLangTags(pureLine, self.__LID[0])
                            pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
                            self.__Tree.updateTree(self.__parL2[index])
                            pureLine = self.__Tree.wordTags()
                            #print pureLine
                            pureLine = self.__addLangTags(pureLine, self.__LID[1])
                            #print pureLine
                            #sys.exit(0)
                            pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
                            self.__csHash.add(index)
                            stopLength -= 1
                        else:
                            continue
                        if stopLength <= 0:
                            break
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def collectData(self):
        """Like __genTrainDataDupStrict but only for csType 4 and only the
        first split; returns initialSlitCSData -- a mixed list of pure lines
        and (parL1, parL2, align) triples -- for __genFromSingleData.

        NOTE(review): the splitIndex != 0 branch below is dead (the early
        return fires first), and its sample[0..3] indexing would not match
        the 3-tuples/strings stored here; kept verbatim."""
        statusCount = 0
        for csType in [4]:
            for data in self.__dataRanges[csType]:
                initialSlitCSData = []
                for splitIndex in range(len(self.__splits)):
                    if splitIndex > 0:
                        # Only the first split is generated; hand the collected
                        # raw material back to the caller.
                        return initialSlitCSData
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                        # NOTE(review): (pureLine) is just the string, not a tuple.
                        initialSlitCSData.append((pureLine))
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                        initialSlitCSData.append((pureLine))
                    if splitIndex != 0:
                        # Dead branch -- see docstring.
                        random.seed()
                        csSample = random.sample(initialSlitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        ##for csType in self.__csVariants:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__parL1):
                                ##break
                                index = 0   # wrap and keep looping
                                print "Still:",stopLength," Looping.."
                            csLines = []
                            csSeqs = []
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                # Accept only if both CS orders cover exactly the
                                # same vocabulary as the two pure sentences.
                                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                                self.__Tree.updateTree(self.__parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                                if pureWords == csWords:
                                    p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                                    p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                                    pureData.append(p1)
                                    pureData.append(p2)
                                    cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                                    csData.append(cs1)
                                    cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                                    csData.append(cs2)
                                    if splitIndex == 0:
                                        # Keep the raw parallel material for reuse.
                                        initialSlitCSData.append((self.__parL1[index],self.__parL2[index], self.__align[index]))
                                    self.__csHash.add(index)
                                    stopLength -= 1
                                    ##else:
                                    ##    l1Switch = (0, tuple(csSeqs[0]))
                                    ##    l2Switch = (1, tuple(csSeqs[1]))
                                    ##    self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
                            else:
                                continue
                            if stopLength <= 0:
                                break
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def __genFromSingleData(self):
        """Regenerate per-csType/per-split training files from the single
        dataset returned by collectData(), writing pure sentences once to a
        shared "Baseline" file.

        NOTE(review): `tr` in the "Training Break!!" line below is undefined
        in this scope (NameError if that path is hit); kept verbatim."""
        dataset = self.collectData()
        CSData = [d for d in dataset if len(d)==3]      # (parL1, parL2, align) triples
        PUREData = [d[1] for d in dataset if len(d)==1] # NOTE(review): len==1 entries are 1-char strings here -- verify
        pureFile = open(self.__outputDir+"Baseline"+self.__fileSuffix,'w')
        pureFlag = 1   # write the baseline file only once
        for csType in self.__csVariants:
            for splitIndex in range(len(self.__splits)):
                Split = self.__splits[splitIndex]
                dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(len(dataset))+self.__fileSuffix,'w')
                stopLength = len(CSData)/2
                csData = []
                pureData = []
                index = -1
                while 1:
                    index += 1
                    if index == len(self.__parL1):
                        ##break
                        index = 0
                        print "Still:",stopLength," Looping.."
                    csLines = []
                    csSeqs = []
                    for order in range(2):
                        #order = stopLength%2
                        self.__csInstance.updateHandler(CSData[index][0], CSData[index][1], CSData[index][2], order)
                        csReturn = self.__csInstance.csSentence(csType)
                        csLine = csReturn[0]
                        if csLine != -1:
                            csLines.append(csLine)
                            csSeqs.append(csReturn[1])
                    if len(csLines) == 2:
                        csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                        self.__Tree.updateTree(self.__parL1[index])
                        pureLine1 = self.__Tree.wordTags()
                        pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                        self.__Tree.updateTree(self.__parL2[index])
                        pureLine2 = self.__Tree.wordTags()
                        pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                        pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                        if pureWords == csWords:
                            p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                            p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                            pureData.append(p1)
                            pureData.append(p2)
                            cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                            csData.append(cs1)
                            cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                            csData.append(cs2)
                            self.__csHash.add(index)
                            stopLength -= 1
                    else:
                        continue
                    if stopLength <= 0:
                        break
                if stopLength > 0:
                    print tr, stopLength, "Training Break!!"
                    dummy = raw_input()
                for csLine in csData:
                    dataFile.write(csLine)
                if pureFlag:
                    pureFlag = 0
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    for pureLine in PUREData:
                        pureFile.write(pureLine)
                    pureFile.close()
                dataFile.close()

    def __genTrainDataDupStrict(self):
        """Strict paired generator (the one generateData actually runs):
        split 0 generates (cs1, cs2, p1, p2) quadruples requiring identical
        vocabulary between CS and pure sentences; later splits re-sample
        those quadruples instead of generating fresh data."""
        statusCount = 0
        for csType in self.__csVariants:
            for data in self.__dataRanges[csType]:
                initialSlitCSData = []   # (cs1, cs2, p1, p2) from split 0
                for splitIndex in range(len(self.__splits)):
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    if splitIndex != 0:
                        # Reuse split-0 quadruples instead of regenerating.
                        random.seed()
                        csSample = random.sample(initialSlitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        ##for csType in self.__csVariants:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__parL1):
                                ##break
                                index = 0   # wrap and keep looping
                                print "Still:",stopLength," Looping.."
                            csLines = []
                            csSeqs = []
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                # Strict acceptance: CS vocabulary must equal the
                                # combined pure-sentence vocabulary.
                                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                                self.__Tree.updateTree(self.__parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                                if pureWords == csWords:
                                    p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                                    p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                                    pureData.append(p1)
                                    pureData.append(p2)
                                    cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                                    csData.append(cs1)
                                    cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                                    csData.append(cs2)
                                    if splitIndex == 0:
                                        initialSlitCSData.append((cs1,cs2, p1, p2))
                                    self.__csHash.add(index)
                                    stopLength -= 1
                                    ##else:
                                    ##    l1Switch = (0, tuple(csSeqs[0]))
                                    ##    l2Switch = (1, tuple(csSeqs[1]))
                                    ##    self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
                            else:
                                continue
                            if stopLength <= 0:
                                break
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def __addLangTags(self, WordTags, lTag):
        """Return a deep-ish copy of WordTags (list of [word, tag, ...]) with
        lTag appended to every token; the input is not mutated."""
        wordTags = []
        for wt in WordTags:
            newWT = [i for i in wt]   # copy each token so append is safe
            wordTags.append(newWT)
        for index in range(len(wordTags)):
            wordTags[index].append(lTag)
        return wordTags

    def __genPosMap(self):
        """Populate __posMap (source tag -> universal tag) from both map
        files, plus __L1Tags/__L2Tags/__commonTags for LID bookkeeping."""
        for i in open(self.__l1MapFile):
            i = i.strip()
            srcTag = i.split()[0]
            uniTag = i.split()[1]
            self.__posMap[srcTag] = uniTag
        # L2 entries overwrite any colliding L1 tag names.
        for i in open(self.__l2MapFile):
            i = i.strip()
            srcTag = i.split()[0]
            uniTag = i.split()[1]
            self.__posMap[srcTag] = uniTag
        self.__L1Tags = set()
        for line in open(self.__l1MapFile):
            tag = line.split()[0]
            self.__L1Tags.add(tag)
        for line in open(self.__l2MapFile):
            tag = line.split()[0]
            self.__L2Tags.add(tag)
        self.__commonTags = set([c for c in self.__L1Tags if c in self.__L2Tags])

    def __map2Uni(self, wordTagsLangs):
        """Map each token's POS tag (slot 1) to its universal tag, 'X' when
        unmapped.  NOTE: newLine aliases the input tokens, so the caller's
        lists are modified in place."""
        newLine = []
        for index in range(len(wordTagsLangs)):
            newLine.append(wordTagsLangs[index])
            tag = wordTagsLangs[index][1]
            try:
                newLine[index][1] = self.__posMap[tag]
            except:
                newLine[index][1] = 'X'
        return newLine

    def __map2UniControl(self, wordTagsLangs):
        """Like __map2Uni but suffixes the (universal) tag with the token's
        language ID (slot 2), e.g. 'NOUN_HI'.  Same aliasing caveat."""
        newLine = []
        for index in range(len(wordTagsLangs)):
            newLine.append(wordTagsLangs[index])
            tag = wordTagsLangs[index][1]
            lang = wordTagsLangs[index][2]
            try:
                newLine[index][1] = self.__posMap[tag]+'_'+lang
            except:
                newLine[index][1] = 'X'+'_'+lang
        return newLine

    def __genPhraseMap(self):
        """Load the phrase-label compatibility map (hard-coded path): each
        line is 'L1Label l2a,l2b,...' -> __phraseMap[L1Label] extends."""
        phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping")
        for i in phraseMapFile:
            i = i.strip()
            self.__phraseMap[i.split()[0]].extend(i.split()[1].split(","))

    def generateData(self):
        """Public entry point: currently runs the strict duplicate-pair
        generator only (other variants left commented out)."""
        ##for i in range(10):
        ##self.__fileSuffix = "."+str(i)
        #self.__genTrainDataDiverse()
        self.__genTrainDataDupStrict()