class DataGenerator:
    """Builds code-switched (CS) French/English POS-tagged train/test files.

    Reads parallel L1/L2 sentences plus word alignments, asks a CSHandler to
    synthesize code-switched sentences, appends a language id to every token,
    and writes space-separated "word_#tag_#lang" lines under outputDir.

    NOTE(review): Python 2 code (print statements, raw_input). Depends on
    module-level names defined elsewhere in this file: sys, random, dd
    (presumably collections.defaultdict), CSHandler, Utils.
    """

    def __init__(self, outDir):
        # outDir: directory prefix for every generated data file.
        sys.stderr.write("DataGenerator: Constructor\n")
        ## Languages and Order
        self.__LID = ["FR","EN"]
        # Machine-specific mapping files: source tagset -> universal tagset.
        self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/en-ptb.map"
        self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E0/UniversalMapping/fr-paris.map"
        ## Data containers (filled by loadData; kept parallel by index)
        self.__L1 = []
        self.__L2 = []
        self.__align = []
        self.__outputDir = outDir
        self.__posMap = {}           # source tag -> universal tag
        self.__phraseMap = dd(list)  # phrase key -> list of mapped phrases
        self.__csInstance = CSHandler()
        self.__utils = Utils()
        ## Generation Variants
        self.__csVariants = [0,1,2]           # CS generation strategies
        self.__tagsetVariants = ["",".uni"]   # file-suffix variants (only used in comments below)
        self.__dataRange = range(52,1000,52)  # training-set sizes to generate
        ##LID stuff
        self.__L1Tags = set()
        self.__L2Tags = set()
        self.__commonTags = set()  # tags present in both tagsets -> language id "C"
        ## Pre processing
        self.__genPosMap()
        self.__genPhraseMap()
        self.__csInstance.updatePhraseMap(self.__phraseMap)

    def loadData(self, l1Data, l2Data, aligns):
        """Load parallel L1/L2 sentence files and their word-alignment file.

        NOTE(review): the file handles opened here are never explicitly
        closed (relies on GC).
        """
        self.__L1 = [l.strip() for l in open(l1Data)]
        self.__L2 = [l.strip() for l in open(l2Data)]
        self.__align = [l.strip() for l in open(aligns)]

    def __genTestData(self, testIndices):
        """Write TestCSType<k> and TestCSType<k>.uni for each CS variant.

        Tries to collect 5129 CS sentences (magic constant) from the test
        indices; the language order alternates via stopLength%2.
        """
        for csType in self.__csVariants:
            #for tag in self.__tagsetVariants:
            dataFile = open(self.__outputDir+"TestCSType"+str(csType),'w')
            dataFileUni = open(self.__outputDir+"TestCSType"+str(csType)+".uni",'w')
            stopLength = 5129  # remaining quota of CS test sentences
            for index in testIndices:
                order = stopLength%2  # flip which language leads each time
                self.__csInstance.updateHandler(self.__L1[index], self.__L2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                #csSequence = csReturn[1]
                if csLine != -1:  # -1 signals "no CS sentence possible for this pair"
                    stopLength -= 1
                else:
                    continue
                self.__addLangTags(csLine)
                csLineUni = self.__map2Uni(csLine)
                dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
                dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
                if stopLength == 0:
                    break
            dataFile.close()
            dataFileUni.close()
            if stopLength != 0:
                # Quota not met before test indices ran out; pause for the operator.
                print "Test Break!!", 5129, stopLength
                dummy = raw_input()

    def __genTrainData(self, pureIndices, csIndices):
        """Write TrainCSType<k>CS<tr>Pure<pr*2> (+ .uni) files mixing pure and
        CS sentences, for several mixture ratios at each size in __dataRange.

        NOTE(review): pr does double duty as ratio counter and sentence count
        -- it is overwritten before the pr += 1 at the bottom, so the
        "for pr in range(3)" intent in the old comment is not preserved; the
        pr -= 1 retry on failure compounds this. Verify before reuse.
        """
        statusCount = 0
        for data in self.__dataRange:
            pr = 0
            while 1:
                #for pr in range(3):
                if pr == 3:
                    break
                pr= int(pr*1.0/2 * data)  # pure sentence-pair budget for this ratio
                tr = data - pr            # CS sentence budget
                pr = pr/2
                random.seed()
                pIndices = random.sample(pureIndices, pr)
                cIndices = random.sample(csIndices, tr*5)  # oversample: not every index yields a CS sentence
                for csType in self.__csVariants:
                    print csType
                    # Debugging !!
                    #switch = ""
                    #############
                    #for tag in self.__tagsetVariants:
                    # Debugging !!
                    #if switch == "yes":
                    #    break
                    ###################
                    #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(tr)+"Pure"+str(pr*2),'w')
                    dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(tr)+"Pure"+str(pr*2)+".uni",'w')
                    # Monolingual ("pure") part: one L1 and one L2 line per sampled index.
                    for index in pIndices:
                        l1Line = self.__utils.wordTags(self.__L1[index])
                        l2Line = self.__utils.wordTags(self.__L2[index])
                        self.__addLangTags(l1Line)
                        self.__addLangTags(l2Line)
                        l1LineUni = self.__map2Uni(l1Line)
                        l2LineUni = self.__map2Uni(l2Line)
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), l1Line))+'\n')
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), l2Line))+'\n')
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), l1LineUni))+'\n')
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), l2LineUni))+'\n')
                    # Code-switched part: emit until the tr quota is met.
                    stopLength = tr
                    for index in cIndices:
                        csLine = ""
                        order = stopLength%2
                        #print order
                        self.__csInstance.updateHandler(self.__L1[index], self.__L2[index], self.__align[index], order)
                        csReturn = self.__csInstance.csSentence(csType)
                        # Debugging !!
                        #sys.stderr.write("Switch to another CS variant?? ")
                        #switch = raw_input()
                        #if switch == "yes":
                        #    break
                        ###############
                        csLine = csReturn[0]
                        #csSequence = csReturn[1]
                        if csLine != -1:
                            stopLength -= 1
                        else:
                            continue
                        self.__addLangTags(csLine)
                        csLineUni = self.__map2Uni(csLine)
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
                        if stopLength == 0:
                            break
                    dataFile.close()
                    dataFileUni.close()
                    if stopLength != 0:
                        print tr, stopLength, "Training Break!!"
                        pr -= 1  # retry this ratio (see NOTE in docstring)
                        #dummy = raw_input()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
                pr += 1
        print statusCount

    def __addLangTags(self, wordTags):
        """Append a language id to each [word, tag] entry of wordTags, in place.

        Tag membership decides the id: common tags -> "C", else FR/EN.
        """
        #print self.__L1Tags
        #print self.__L2Tags
        #print wordTags
        for index in range(len(wordTags)):
            tag = wordTags[index][1]
            lang = ""
            if tag in self.__commonTags:
                lang = "C"
            elif tag in self.__L1Tags:
                lang = self.__LID[0]
            elif tag in self.__L2Tags:
                lang = self.__LID[1]
            if lang == "":
                # Tag not found in either tagset: pause so the operator notices.
                print "Something wrong with the tagsets in the function add_lang"
                dummy = raw_input()
            wordTags[index].append(lang)

    def __genPosMap(self):
        """Build __posMap (source tag -> universal tag) plus per-language tag
        sets from the two mapping files.

        Map-file format, per the parsing below: "<src>[|<src>...] <uniTag>".
        """
        for i in open(self.__l1MapFile):
            i = i.strip()
            srcTag = [i.split()[0]]
            uniTag = i.split()[1]
            if srcTag[0].find('|') >= 0:
                srcTag = srcTag[0].split('|')  # several source tags share one universal tag
            for tag in srcTag:
                self.__posMap[tag] = uniTag
        for i in open(self.__l2MapFile):
            i = i.strip()
            srcTag = [i.split()[0]]
            uniTag = i.split()[1]
            if srcTag[0].find('|') >= 0:
                srcTag = srcTag[0].split('|')
            for tag in srcTag:
                self.__posMap[tag] = uniTag
        # Second pass over the same files: collect each language's raw tag inventory.
        self.__L1Tags = set()
        for line in open(self.__l1MapFile):
            tags = line.split()[0].split('|')
            for tag in tags:
                self.__L1Tags.add(tag)
        for line in open(self.__l2MapFile):
            tags = line.split()[0].split('|')
            for tag in tags:
                self.__L2Tags.add(tag)
        self.__commonTags = set([c for c in self.__L1Tags if c in self.__L2Tags])

    def __map2Uni(self, wordTagsLangs):
        """Return the line with every tag replaced by its universal tag.

        NOTE(review): entries are aliased, not copied, so the caller's line is
        mutated too; the bare except silently keeps the old tag on any lookup
        failure apart from prompting the operator.
        """
        newLine = []
        for index in range(len(wordTagsLangs)):
            newLine.append(wordTagsLangs[index])
            tag = wordTagsLangs[index][1]
            try:
                newLine[index][1] = self.__posMap[tag]
            except:
                dummy = raw_input("Something wrong.. Couldn't find Uni Map\n")
        return newLine

    def __genPhraseMap(self):
        """Load the phrase mapping file (hard-coded path) into __phraseMap.

        File format, per the parsing below: "<key> <v1>,<v2>,...".
        """
        phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping")
        for i in phraseMapFile:
            i = i.strip()
            self.__phraseMap[i.split()[0]].extend(i.split()[1].split(","))

    def __randomSample(self):
        """Randomly split line indices into train/test plus pure/CS subsets.

        NOTE(review): the "i not in testIndices" / "i not in csIndices" scans
        are O(n) per lookup over lists (O(n^2) overall, ~95k lines); building
        sets first would make this fast without changing behavior.
        """
        print "Random Sample Train"
        totalLines = 95129
        testLines = 15129
        testIndices = random.sample(range(totalLines),testLines)
        #print testIndices
        trainIndices = []
        for i in range(totalLines):
            if i not in testIndices:
                trainIndices.append(i)
        csIndices = random.sample(trainIndices, 6000)
        remaining = []
        for i in trainIndices:
            if i not in csIndices:
                remaining.append(i)
        pureIndices = random.sample(remaining,6000)
        return trainIndices, testIndices, pureIndices, csIndices

    def __getRanges(self, dataRanges):
        """Read previously saved index lists back from the ranges file.

        Expected line format: "<name>:[i1, i2, ...]" for trainIndices,
        testIndices, pureIndices and csIndices.
        """
        dataRanges = open(dataRanges)
        trainIndices = []
        testIndices = []
        pureIndices = []
        csIndices = []
        for i in dataRanges:
            if i.split(":")[0] == "trainIndices":
                indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
                for j in indices:
                    j = int(j)
                    trainIndices.append(j)
            elif i.split(":")[0] == "testIndices":
                indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
                for j in indices:
                    j = int(j)
                    testIndices.append(j)
            elif i.split(":")[0] == "pureIndices":
                indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
                for j in indices:
                    j = int(j)
                    pureIndices.append(j)
            elif i.split(":")[0] == "csIndices":
                indices = i.split(":")[1].strip("[]\n").replace(" ","").split(",")
                for j in indices:
                    j = int(j)
                    csIndices.append(j)
        #print len(testIndices), len(trainIndices), len(pureIndices), len(csIndices)
        #************************************
        # Un-comment this when sampling again
        #************************************
        '''rangesFile = open(dataRanges,'w')
        rangesFile.write("trainIndices:"+str(trainIndices)+"\n")
        rangesFile.write("testIndices:"+str(testIndices)+"\n")
        rangesFile.write("pureIndices:"+str(pureIndices)+"\n")
        rangesFile.write("csIndices:"+str(csIndices)+"\n")
        rangesFile.close()
        return self.__randomSample()'''
        return trainIndices, testIndices, pureIndices, csIndices

    def generateData(self, dataRanges):
        """Entry point: load the saved index split and generate train + test data."""
        ranges = self.__getRanges(dataRanges)
        #unmappedEnglish = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/unmapped")
        #unmappedEnPhrases = [i.strip() for i in unmappedEnglish]
        #trainIndices = ranges[0]
        testIndices = ranges[1]
        pureIndices = ranges[2]
        csIndices = ranges[3]
        self.__genTrainData(pureIndices, csIndices)
        self.__genTestData(testIndices)
class Generator:
    """Config-driven generator of code-switched POS training/test files.

    Delegates data access to a DataHandler and CS synthesis to a CSHandler;
    writes four files per (csType, size, split): data, data.uni, and the
    matching "_Control" pure-sentence files.

    NOTE(review): Python 2 code. generateTestData, generateTrainDataForTest,
    generateTrainData and generateUCTrainData are near-identical copies that
    differ only in data ranges, progress printing, the break message, and the
    acceptance condition -- a strong candidate for later consolidation.
    """

    def __init__(self, outDir):
        self.config = None           # set by prepareConfig() below
        self.__csHash = set()        # dedup keys for generated CS sentences
        self.__outputDir = outDir
        self.__csInstance = CSHandler()
        self.__dataHandler = DataHandler()
        self.__utils = Utils()
        self.__Tree = Dependencytree()
        self.__fileSuffix = ""       # appended to every output filename
        self.prepareConfig()

    def prepareConfig(self):
        """Install the default generation config (variants, sizes, splits)."""
        self.config = GeneratingConfig()
        self.config.setCSVariants([0, 1, 2, 3, 4])
        self.config.setDataRanges({0:range(50, 1001, 50), 1:range(50, 1001, 50), 2:range(50, 1001, 50), 3:range(50, 1001, 50), 4:range(50, 1001, 50)})
        self.config.setSplits([(50, 50), (60, 40), (70, 30), (80, 20), (90, 10)])
        self.config.setTagsetVariants([".uniq", ".uni"])

    def prepareGenerator(self):
        """Push the data handler's language ids into the CS handler."""
        self.__csInstance.updateLIDTags(self.__dataHandler.LID[0], self.__dataHandler.LID[1])

    def prepareRealTest(self, dataFile, outFile):
        """Re-tag an existing "word_#tag..." file with universal tags.

        NOTE(review): the input handle is never explicitly closed.
        """
        dataFile = open(dataFile)
        outFile = open(outFile, 'w')
        for line in dataFile:
            line = map(lambda x:x.split('_#'), line.strip().split())
            uniLine = self.__dataHandler.mapLD2Uni(line)
            outFile.write(' '.join(map(lambda x:'_#'.join(x), uniLine)) + '\n')
        outFile.close()

    def generateTestData(self):
        """Generate test files (named "Train*"; note: reuses the train naming).

        Overrides the data ranges with small test sizes, then for each
        (csType, size, split) mixes pure L1/L2 sentences with CS sentences.
        """
        self.config.setDataRanges({0:range(30, 151, 50), 1:range(30, 151, 50), 2:range(30, 151, 50), 3:range(30, 151, 50), 4:range(30, 151, 50)})
        for csType in self.config.csVariants:
            print "type" + str(csType)
            for data in self.config.dataRanges[csType]:
                print
                print " numSents:" + str(data * 2),
                initialSplitCSData = []  # CS/pure tuples from split 0, resampled by later splits
                for splitIndex in range(len(self.config.splits)):
                    csData = []
                    Split = self.config.splits[splitIndex]  # (pure%, CS%)
                    pureData = []
                    pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
                    dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
                    pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
                    dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
                    pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)  # pure sentence pairs
                    tr = data - pr                                            # CS sentence pairs
                    print " Pure:" + str(2 * pr),
                    print " CS:" + str(2 * tr),
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
                    pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
                    for index in pIndicesL1:
                        line = self.__dataHandler.pureL1[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    for index in pIndicesL2:
                        line = self.__dataHandler.pureL2[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    if splitIndex != 0:
                        # Later splits re-sample CS material generated for split 0.
                        random.seed()
                        csSample = random.sample(initialSplitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__dataHandler.parL1):
                                ##break
                                index = 0  # wrap around and keep trying
                                print "Still:", stopLength, " Looping.."
                            csLines = []
                            csSeqs = []
                            hashKeys = ["", ""]
                            # Generate one CS sentence per language order; keep the pair only if both succeed.
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    hashKeys[order] = (index, order, tuple(csReturn[1]))
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                                # NOTE(review): "True or ..." makes this condition always true
                                # ("or" short-circuits before "and"), disabling both the
                                # vocabulary check and the __csHash dedup. Presumably a
                                # deliberate debug override -- confirm before relying on dedup.
                                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                                    pureData.append(tuple(pureLine1))
                                    pureData.append(tuple(pureLine2))
                                    csData.append(tuple(csLines[0]))
                                    csData.append(tuple(csLines[1]))
                                    if splitIndex == 0:
                                        initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                                    stopLength -= 1
                                    for hashKey in hashKeys:
                                        self.__csHash.add(hashKey)
                                else:
                                    continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Testing Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
                        dataFile.write(self.makeString(csLine))
                    for pureLine in pureData:
                        pureFile.write(self.makeString(pureLine))
                        pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
                    pureFile.close()
                    dataFile.close()
                    pureUniFile.close()
                    dataUniFile.close()

    def generateDataForTest(self):
        """Generate ten suffixed replications (.0 .. .9) of the 450-size set."""
        for i in range(10):
            self.__fileSuffix = "."+str(i)
            self.generateTrainDataForTest()

    def generateTrainDataForTest(self):
        """Same pipeline as generateTrainData but pinned to size 450.

        See the class NOTE about duplication; the acceptance condition here is
        also disabled by the "True or ..." override.
        """
        self.config.setDataRanges({0:[450], 1:[450], 2:[450], 3:[450], 4:[450]})
        statusCount = 0
        for csType in self.config.csVariants:
            print "type" + str(csType),
            for data in self.config.dataRanges[csType]:
                print " numSents:" + str(data * 2),
                initialSplitCSData = []
                for splitIndex in range(len(self.config.splits)):
                    csData = []
                    Split = self.config.splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
                    dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
                    pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
                    dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
                    pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
                    tr = data - pr
                    print " Pure:" + str(2 * pr),
                    print " CS:" + str(2 * tr),
                    if splitIndex == len(self.config.splits) - 1:
                        print  # newline after the last split's progress line
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
                    pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
                    for index in pIndicesL1:
                        line = self.__dataHandler.pureL1[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    for index in pIndicesL2:
                        line = self.__dataHandler.pureL2[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    if splitIndex != 0:
                        random.seed()
                        csSample = random.sample(initialSplitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__dataHandler.parL1):
                                ##break
                                index = 0
                                print "Still:", stopLength, " Looping.. ",
                            csLines = []
                            csSeqs = []
                            hashKeys = ["", ""]
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    hashKeys[order] = (index, order, tuple(csReturn[1]))
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                                # NOTE(review): always-true condition (see generateTestData).
                                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                                    pureData.append(tuple(pureLine1))
                                    pureData.append(tuple(pureLine2))
                                    csData.append(tuple(csLines[0]))
                                    csData.append(tuple(csLines[1]))
                                    if splitIndex == 0:
                                        initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                                    stopLength -= 1
                                    for hashKey in hashKeys:
                                        self.__csHash.add(hashKey)
                                else:
                                    continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Training Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
                        dataFile.write(self.makeString(csLine))
                    for pureLine in pureData:
                        pureFile.write(self.makeString(pureLine))
                        pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
                    pureFile.close()
                    dataFile.close()
                    pureUniFile.close()
                    dataUniFile.close()
                    statusCount += 1
                    if statusCount % 50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def generateTrainData(self):
        """Generate the full training grid over the configured data ranges.

        Identical pipeline to generateTrainDataForTest except for the range
        override and progress printing (see class NOTE on duplication).
        """
        statusCount = 0
        for csType in self.config.csVariants:
            print "type" + str(csType)
            for data in self.config.dataRanges[csType]:
                print
                print " numSents:" + str(data * 2),
                initialSplitCSData = []
                for splitIndex in range(len(self.config.splits)):
                    csData = []
                    Split = self.config.splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
                    dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
                    pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
                    dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
                    pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
                    tr = data - pr
                    print " Pure:" + str(2 * pr),
                    print " CS:" + str(2 * tr),
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
                    pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
                    for index in pIndicesL1:
                        line = self.__dataHandler.pureL1[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    for index in pIndicesL2:
                        line = self.__dataHandler.pureL2[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    if splitIndex != 0:
                        random.seed()
                        csSample = random.sample(initialSplitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__dataHandler.parL1):
                                ##break
                                index = 0
                                print "Still:", stopLength, " Looping.."
                            csLines = []
                            csSeqs = []
                            hashKeys = ["", ""]
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    hashKeys[order] = (index, order, tuple(csReturn[1]))
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                                # NOTE(review): always-true condition (see generateTestData).
                                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                                    pureData.append(tuple(pureLine1))
                                    pureData.append(tuple(pureLine2))
                                    csData.append(tuple(csLines[0]))
                                    csData.append(tuple(csLines[1]))
                                    if splitIndex == 0:
                                        initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                                    stopLength -= 1
                                    for hashKey in hashKeys:
                                        self.__csHash.add(hashKey)
                                else:
                                    continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Training Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
                        dataFile.write(self.makeString(csLine))
                    for pureLine in pureData:
                        pureFile.write(self.makeString(pureLine))
                        pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
                    pureFile.close()
                    dataFile.close()
                    pureUniFile.close()
                    dataUniFile.close()
                    statusCount += 1
                    if statusCount % 50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def generateUCTrainData(self):  # Unknown words constrained training data
        """Like generateTrainData, but the acceptance condition is real here:
        a CS pair is kept only when its vocabulary equals the pure sentences'
        vocabulary and both hash keys are unseen (no "True or" override).
        """
        statusCount = 0
        for csType in self.config.csVariants:
            for data in self.config.dataRanges[csType]:
                initialSplitCSData = []
                for splitIndex in range(len(self.config.splits)):
                    csData = []
                    Split = self.config.splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
                    dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
                    pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
                    dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
                    pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
                    tr = data - pr
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
                    pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
                    for index in pIndicesL1:
                        line = self.__dataHandler.pureL1[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    for index in pIndicesL2:
                        line = self.__dataHandler.pureL2[index]
                        line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
                        line = self.__dataHandler.makeLD(line)
                        pureData.append(tuple(line))
                        csData.append(tuple(line))
                    if splitIndex != 0:
                        random.seed()
                        csSample = random.sample(initialSplitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__dataHandler.parL1):
                                ##break
                                index = 0
                                print "Still:", stopLength, " Looping.."
                            csLines = []
                            csSeqs = []
                            hashKeys = ["", ""]
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    hashKeys[order] = (index, order, tuple(csReturn[1]))
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                                if pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                                    pureData.append(tuple(pureLine1))
                                    pureData.append(tuple(pureLine2))
                                    csData.append(tuple(csLines[0]))
                                    csData.append(tuple(csLines[1]))
                                    if splitIndex == 0:
                                        initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                                    stopLength -= 1
                                    for hashKey in hashKeys:
                                        self.__csHash.add(hashKey)
                                else:
                                    continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Training Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
                        dataFile.write(self.makeString(csLine))
                    for pureLine in pureData:
                        pureFile.write(self.makeString(pureLine))
                        pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
                    pureFile.close()
                    dataFile.close()
                    pureUniFile.close()
                    dataUniFile.close()
                    statusCount += 1
                    if statusCount % 50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def makeString(self, wordsTagsLangs):
        """Render one sentence as "word_#tag_#lang ..." plus a newline."""
        return ' '.join(map(lambda x:"_#".join(x), wordsTagsLangs)) + '\n'

    def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
        """Delegate all corpus loading to the DataHandler."""
        self.__dataHandler.loadData(l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data)
class DataGenerator: def __init__(self, outDir): sys.stderr.write("DataGenerator: Constructor\n") ## Languages and Order self.__LID = ["HI","EN"] self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map" self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map" ## Data containers self.__parL1 = [] self.__parL2 = [] self.__align = [] self.__pureL1 = [] self.__pureL2 = [] self.__outputDir = outDir self.__posMap = {} self.__phraseMap = dd(list) self.__csInstance = CSHandler() self.__utils = Utils() self.__Tree = Dependencytree() ## Generation Variants self.__csVariants = [0,1,2,3,4] self.__tagsetVariants = ["",".uni"] self.__dataRange = range(50,900,50) ##self.__dataRange = [200] self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)] self.__csHash = set() ##LID stuff self.__L1Tags = set() self.__L2Tags = set() self.__commonTags = set() ## Pre processing self.__genPosMap() self.__genPhraseMap() self.__csInstance.updatePhraseMap(self.__phraseMap) self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1]) ## Real test overwrites #self.__csVariants = [1,2,3,4] self.__tagsetVariants = [""] self.__dataRange = [400] self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)} #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]} #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]} #self.__splits = [(50,50)] #for i in range(0,51,5): # split = (100-i, i) # self.__splits.append(split) self.__fileSuffix = "" def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data): self.__parL1 = self.__utils.readSentences(l1Data) self.__parL2 = self.__utils.readSentences(l2Data) self.__align = self.__utils.readAligns(l1Aligns, l2Aligns) self.__pureL1 = self.__utils.readSentencesPlain(pureL1Data) self.__pureL2 = self.__utils.readSentencesPlain(pureL2Data) sys.stderr.write("parL1:"+str(len(self.__parL1))+"\n") 
sys.stderr.write("parL2:"+str(len(self.__parL2))+"\n") sys.stderr.write("align:"+str(len(self.__align))+"\n") sys.stderr.write("pureL1:"+str(len(self.__pureL1))+"\n") sys.stderr.write("pureL2:"+str(len(self.__pureL2))+"\n") def __genTrainData(self): statusCount = 0 for data in self.__dataRange: #control = 0 #while 1: for Split in self.__splits: #for control in range(3): #if control == 3: # break #pr= int(control*1.0/2 * data) pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data) tr = data - pr pr = pr/2 print pr random.seed() pIndicesL1 = random.sample(range(len(self.__pureL1)),pr) pIndicesL2 = random.sample(range(len(self.__pureL2)),pr) for csType in self.__csVariants: self.__csHash = set() ##sys.stderr.write("csType:"+str(csType)+'\n') # Debugging !! #switch = "" ############# #for tag in self.__tagsetVariants: # Debugging !! #if switch == "yes": # break ################### #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n") sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n') dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w') ##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w') ##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w') ##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w') for index in pIndicesL1: line = self.__pureL1[index] #sys.stderr.write("L1 Line:"+str(line)+'\n') line = self.__addLangTags(line, self.__LID[0]) dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n') ##Commented for real test ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda 
x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n') lineUni = self.__map2Uni(line) dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n') dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')''' for index in pIndicesL2: line = self.__pureL2[index] #sys.stderr.write("L2 Line:"+str(line)+'\n') line = self.__addLangTags(line, self.__LID[1]) dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n') ##Commented for real test ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n') lineUni = self.__map2Uni(line) dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n') dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')''' stopLength = tr+1 index = -1 while 1: index += 1 if index == len(self.__parL1): index = 0 csLine = "" order = stopLength%2 #sys.stderr.write("order:"+str(order)+'\n') self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order) csReturn = self.__csInstance.csSentence(csType) # Debugging !! #sys.stderr.write("Switch to another CS variant?? 
") #switch = raw_input() #if switch == "yes": # break ############### csLine = csReturn[0] #csSequence = csReturn[1] #print csReturn[1] hashKey = (index, tuple(csReturn[1])) #print hashKey if csLine != -1 and hashKey not in self.__csHash: self.__csHash.add(hashKey) stopLength -= 1 else: continue #sys.stderr.write("csLine:"+str(csLine)+'\n') #csLine = self.__addLangTags(csLine) if stopLength <= 0: break dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n') ##Commented for real test ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLine)))+'\n') csLineUni = self.__map2Uni(csLine) dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n') dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLineUni)))+'\n')''' dataFile.close() ##dataFileUni.close() ##dataFileUniq.close() ##dataFileUniUniq.close() if stopLength > 0: print tr, stopLength, "Training Break!!" #pr -= 1 dummy = raw_input() statusCount += 1 if statusCount%50 == 0: print statusCount, sys.stdout.flush() #pr += 1 print statusCount def __genTrainDataDiverse(self): statusCount = 0 for csType in self.__csVariants: for data in self.__dataRanges[csType]: for Split in self.__splits: pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data) tr = data - pr pr = pr/2 print pr random.seed() pIndicesL1 = random.sample(range(len(self.__pureL1)),pr) pIndicesL2 = random.sample(range(len(self.__pureL2)),pr) ##for csType in self.__csVariants: self.__csHash = set() sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n') dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w') #### Dangerous #### ##pIndicesL1 = [] ##pIndicesL2 = [] #### End of Dangerous #### for index in pIndicesL1: line = self.__pureL1[index] line = self.__addLangTags(line, 
self.__LID[0]) dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n') for index in pIndicesL2: line = self.__pureL2[index] line = self.__addLangTags(line, self.__LID[1]) dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n') stopLength = tr+1 index = -1 while 1: index += 1 if index == len(self.__parL1): break index = 0 csLine = "" order = stopLength%2 self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order) csReturn = self.__csInstance.csSentence(csType) csLine = csReturn[0] hashKey = (index, tuple(csReturn[1])) if csLine != -1 and hashKey not in self.__csHash: #self.__csHash.add(hashKey) self.__csHash.add(index) stopLength -= 1 else: continue if stopLength <= 0: break dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n') dataFile.close() if stopLength > 0: print tr, stopLength, "Training Break!!" dummy = raw_input() statusCount += 1 if statusCount%50 == 0: print statusCount, sys.stdout.flush() print statusCount def __genTrainDataDup(self): statusCount = 0 for csType in self.__csVariants: for data in self.__dataRanges[csType]: for splitIndex in range(len(self.__splits)): csData = [] Split = self.__splits[splitIndex] pureData = [] pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w') dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w') pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data) tr = data - pr #pr = pr/2 print pr random.seed() pIndicesL1 = random.sample(range(len(self.__pureL1)),pr) pIndicesL2 = random.sample(range(len(self.__pureL2)),pr) for index in pIndicesL1: line = self.__pureL1[index] line = self.__addLangTags(line, self.__LID[0]) pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n' ##pureFile.write(pureLine) ##if splitIndex == 0: pureData.append(pureLine) csData.append(pureLine) for index 
    def __genTrainDataDup(self):
        """Like __genTrainDataDiverse, but a parallel pair is accepted only when
        BOTH switch orders (L1-first and L2-first) yield a CS sentence; both CS
        variants go to the training file, and the corresponding pure parallel
        sentences go to a separate "_Control" file for comparison experiments.
        File totals are 2*data since every accepted unit contributes two lines.
        """
        statusCount = 0
        for csType in self.__csVariants:
            for data in self.__dataRanges[csType]:
                for splitIndex in range(len(self.__splits)):
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    # "_Control" file: pure counterparts of everything in dataFile.
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    # Pure sentences are buffered for BOTH output files.
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    ##for csType in self.__csVariants:
                    self.__csHash = set()  # indices of parallel pairs already used
                    stopLength = tr
                    index = -1
                    while 1:
                        index += 1
                        if index == len(self.__parL1):
                            break
                            # NOTE(review): unreachable dead code (leftover wrap-around).
                            index = 0
                        csLines = []
                        # Try both switch orders; keep the pair only if both succeed.
                        for order in range(2):
                            #order = stopLength%2
                            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                            csReturn = self.__csInstance.csSentence(csType)
                            csLine = csReturn[0]
                            if csLine != -1:
                                csLines.append(csLine)
                        if len(csLines) == 2:
                            csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n')
                            csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n')
                            ##if splitIndex == 0:
                            # Pure counterparts come from re-tagging the parallel trees.
                            self.__Tree.updateTree(self.__parL1[index])
                            pureLine = self.__Tree.wordTags()
                            pureLine = self.__addLangTags(pureLine, self.__LID[0])
                            pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
                            self.__Tree.updateTree(self.__parL2[index])
                            pureLine = self.__Tree.wordTags()
                            #print pureLine
                            pureLine = self.__addLangTags(pureLine, self.__LID[1])
                            #print pureLine
                            #sys.exit(0)
                            pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
                            self.__csHash.add(index)
                            stopLength -= 1
                        else:
                            continue
                        if stopLength <= 0:
                            break
                    if stopLength > 0:
                        # Ran out of parallel data before tr CS pairs were accepted.
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()  # interactive pause for inspection
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount
    def collectData(self):
        """Generate CS/pure data for csType 4 and RETURN the raw material
        collected during the first split instead of only writing files.

        Returns initialSlitCSData: a mixed list of pure output lines (bare
        strings) and (parL1, parL2, align) triples for accepted CS pairs.
        NOTE(review): the early `return` fires at the START of the second split
        iteration, so the `splitIndex != 0` branch below is dead code in this
        method (its sample[2]/sample[3] indexing would IndexError on the
        3-tuples stored here).  If the loops complete without a second split,
        the method implicitly returns None.
        """
        statusCount = 0
        for csType in [4]:  # restricted to the single CS variant "4"
            for data in self.__dataRanges[csType]:
                initialSlitCSData = []
                for splitIndex in range(len(self.__splits)):
                    if splitIndex > 0:
                        # Hand the first split's collected material to the caller.
                        return initialSlitCSData
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                        # NOTE(review): (pureLine) is NOT a tuple -- appends the bare string.
                        initialSlitCSData.append((pureLine))
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                        initialSlitCSData.append((pureLine))
                    if splitIndex != 0:
                        # Dead in this method (see early return above); kept for
                        # symmetry with __genTrainDataDupStrict.
                        random.seed()
                        csSample = random.sample(initialSlitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        ##for csType in self.__csVariants:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__parL1):
                                ##break
                                index = 0  # wrap around and keep searching
                                print "Still:",stopLength," Looping.."
                            csLines = []
                            csSeqs = []
                            # A pair is usable only if BOTH switch orders produce a CS sentence.
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                # Accept only when the CS pair covers exactly the same
                                # vocabulary as the pure parallel pair.
                                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                                self.__Tree.updateTree(self.__parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                                if pureWords == csWords:
                                    p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                                    p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                                    pureData.append(p1)
                                    pureData.append(p2)
                                    cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                                    csData.append(cs1)
                                    cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                                    csData.append(cs2)
                                    if splitIndex == 0:
                                        # Keep the raw parallel triple for the caller.
                                        initialSlitCSData.append((self.__parL1[index],self.__parL2[index], self.__align[index]))
                                    self.__csHash.add(index)
                                    stopLength -= 1
                                ##else:
                                ##    l1Switch = (0, tuple(csSeqs[0]))
                                ##    l2Switch = (1, tuple(csSeqs[1]))
                                ##    self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
                            else:
                                continue
                            if stopLength <= 0:
                                break
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()  # interactive pause for inspection
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount
    def __genFromSingleData(self):
        """Regenerate CS training files for every csType from the single
        dataset returned by collectData(), so all variants are derived from
        the SAME underlying sentences; pure baseline lines are written once to
        a shared "Baseline" file (guarded by pureFlag).

        NOTE(review): several latent problems are visible here -- `tr` is never
        defined in this scope (the "Training Break" branch would raise
        NameError); collectData() appends bare strings rather than 1-tuples, so
        the len(d)==1/len(d)==3 filters actually test string length on those
        elements; and self.__csHash is reused from whichever method ran last
        rather than being reset.  Confirm before relying on this path.
        """
        dataset = self.collectData()
        CSData = [d for d in dataset if len(d)==3]    # (parL1, parL2, align) triples
        PUREData = [d[1] for d in dataset if len(d)==1]
        pureFile = open(self.__outputDir+"Baseline"+self.__fileSuffix,'w')
        pureFlag = 1  # write the pure baseline only once, on the first pass
        for csType in self.__csVariants:
            for splitIndex in range(len(self.__splits)):
                Split = self.__splits[splitIndex]
                dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(len(dataset))+self.__fileSuffix,'w')
                stopLength = len(CSData)/2
                csData = []
                pureData = []
                index = -1
                while 1:
                    index += 1
                    if index == len(self.__parL1):
                        ##break
                        index = 0  # wrap around and keep searching
                        print "Still:",stopLength," Looping.."
                    csLines = []
                    csSeqs = []
                    # Both switch orders must succeed, as in collectData.
                    for order in range(2):
                        #order = stopLength%2
                        self.__csInstance.updateHandler(CSData[index][0], CSData[index][1], CSData[index][2], order)
                        csReturn = self.__csInstance.csSentence(csType)
                        csLine = csReturn[0]
                        if csLine != -1:
                            csLines.append(csLine)
                            csSeqs.append(csReturn[1])
                    if len(csLines) == 2:
                        # Vocabulary-equality filter between CS pair and pure pair.
                        csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                        self.__Tree.updateTree(self.__parL1[index])
                        pureLine1 = self.__Tree.wordTags()
                        pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                        self.__Tree.updateTree(self.__parL2[index])
                        pureLine2 = self.__Tree.wordTags()
                        pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                        pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                        if pureWords == csWords:
                            p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                            p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                            pureData.append(p1)
                            pureData.append(p2)
                            cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                            csData.append(cs1)
                            cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                            csData.append(cs2)
                            self.__csHash.add(index)
                            stopLength -= 1
                    else:
                        continue
                    if stopLength <= 0:
                        break
                if stopLength > 0:
                    print tr, stopLength, "Training Break!!"  # NOTE(review): `tr` undefined here -- NameError if reached
                    dummy = raw_input()
                for csLine in csData:
                    dataFile.write(csLine)
                if pureFlag:
                    pureFlag = 0
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    for pureLine in PUREData:
                        pureFile.write(pureLine)
                # NOTE(review): pureFile is closed on the first iteration and then
                # re-closed on every later one -- harmless but worth confirming.
                pureFile.close()
                dataFile.close()
    def __genTrainDataDupStrict(self):
        """Strict duplicated-CS generator: the first split searches the
        parallel corpus (with wrap-around) for pairs where BOTH switch orders
        yield a CS sentence AND the CS pair's vocabulary exactly matches the
        pure parallel pair's; later splits RESAMPLE from the first split's
        accepted (cs1, cs2, p1, p2) 4-tuples so every split shares the same
        sentence pool.  CS lines go to the main file, pure counterparts to the
        "_Control" file.

        NOTE(review): with the `break` commented out, the search wraps around
        and there is no membership check against self.__csHash before reuse,
        so the same parallel pair can be emitted more than once after a full
        pass -- confirm this is intended.
        """
        statusCount = 0
        for csType in self.__csVariants:
            for data in self.__dataRanges[csType]:
                initialSlitCSData = []
                for splitIndex in range(len(self.__splits)):
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    if splitIndex != 0:
                        # Later splits: resample from the first split's accepted pool.
                        random.seed()
                        csSample = random.sample(initialSlitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])   # cs1
                            csData.append(sample[1])   # cs2
                            pureData.append(sample[2]) # p1
                            pureData.append(sample[3]) # p2
                    else:
                        ##for csType in self.__csVariants:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__parL1):
                                ##break
                                index = 0  # wrap around and keep searching
                                print "Still:",stopLength," Looping.."
                            csLines = []
                            csSeqs = []
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                # Accept only when CS and pure pairs cover identical vocabulary.
                                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                                self.__Tree.updateTree(self.__parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                                if pureWords == csWords:
                                    p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                                    p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                                    pureData.append(p1)
                                    pureData.append(p2)
                                    cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                                    csData.append(cs1)
                                    cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                                    csData.append(cs2)
                                    if splitIndex == 0:
                                        # 4-tuple consumed by the resample branch above.
                                        initialSlitCSData.append((cs1,cs2, p1, p2))
                                    self.__csHash.add(index)
                                    stopLength -= 1
                                ##else:
                                ##    l1Switch = (0, tuple(csSeqs[0]))
                                ##    l2Switch = (1, tuple(csSeqs[1]))
                                ##    self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
                            else:
                                continue
                            if stopLength <= 0:
                                break
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()  # interactive pause for inspection
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount
dummy = raw_input() for csLine in csData: dataFile.write(csLine) ##if splitIndex == 0: for pureLine in pureData: pureFile.write(pureLine) pureFile.close() dataFile.close() statusCount += 1 if statusCount%50 == 0: print statusCount, sys.stdout.flush() print statusCount def __addLangTags(self, WordTags, lTag): wordTags = [] for wt in WordTags: newWT = [i for i in wt] wordTags.append(newWT) for index in range(len(wordTags)): wordTags[index].append(lTag) return wordTags def __genPosMap(self): for i in open(self.__l1MapFile): i = i.strip() srcTag = i.split()[0] uniTag = i.split()[1] self.__posMap[srcTag] = uniTag for i in open(self.__l2MapFile): i = i.strip() srcTag = i.split()[0] uniTag = i.split()[1] self.__posMap[srcTag] = uniTag self.__L1Tags = set() for line in open(self.__l1MapFile): tag = line.split()[0] self.__L1Tags.add(tag) for line in open(self.__l2MapFile): tag = line.split()[0] self.__L2Tags.add(tag) self.__commonTags = set([c for c in self.__L1Tags if c in self.__L2Tags]) def __map2Uni(self, wordTagsLangs): newLine = [] for index in range(len(wordTagsLangs)): newLine.append(wordTagsLangs[index]) tag = wordTagsLangs[index][1] try: newLine[index][1] = self.__posMap[tag] except: newLine[index][1] = 'X' return newLine def __map2UniControl(self, wordTagsLangs): newLine = [] for index in range(len(wordTagsLangs)): newLine.append(wordTagsLangs[index]) tag = wordTagsLangs[index][1] lang = wordTagsLangs[index][2] try: newLine[index][1] = self.__posMap[tag]+'_'+lang except: newLine[index][1] = 'X'+'_'+lang return newLine def __genPhraseMap(self): phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping") for i in phraseMapFile: i = i.strip() self.__phraseMap[i.split()[0]].extend(i.split()[1].split(",")) def generateData(self): ##for i in range(10): ##self.__fileSuffix = "."+str(i) #self.__genTrainDataDiverse() self.__genTrainDataDupStrict()