## Both the phrases are same, for dual structure principle if l1PhraseTag == l2PhraseTag or l2PhraseTag in self.__phraseMap[l1PhraseTag]: # Debugging !! #sys.stderr.write("Alignment: "+str(align)+"\n") #sys.stderr.write("L2Sequence: "+str(l2Sequence)+" Same Const: "+l1PhraseTag+"\n") ############## break return l1Sequence if __name__ == "__main__": l2Data = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/engParse.wx" l1Data = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/hinParse.wx" l2Aligns = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/engAlign.wx" l1Aligns = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/hinAlign.wx" pureL1 = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/HinPOS/hindiTrain.wx" pureL2 = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/EngPOS/train.0-18.tsv" U = Utils() parL1 = U.readSentences(l1Data) parL2 = U.readSentences(l2Data) align = U.readAligns(l1Aligns, l2Aligns) pureL1 = U.readSentencesPlain(pureL1) pureL2 = U.readSentencesPlain(pureL2) #print parL1[0]print parL2[0]print pureL1[0]print pureL2[0] print align[0] CS = CSHandler() CS.updateHandler(parL1[0], parL2[0], align[0], 0) print CS.csSentence(4)
class DataHandler:
    """Container for parallel/pure corpora and POS-tag mappings.

    Loads L1 (Hindi) / L2 (English) parallel sentences with their word
    alignments plus pure monolingual sentences, and maps source POS tags
    to the universal tagset using the two mapping files.
    """

    def __init__(self):
        ## Resources
        self.LID = ["HI", "EN"]  # language-ID tags, order: L1 then L2
        self.l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/LastSem/Data/UniMaps/en-ptb.map"
        self.l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/LastSem/Data/UniMaps/hi-hyd.map"
        ## Containers
        self.parL1 = []
        self.parL2 = []
        self.align = []
        self.pureL1 = []
        self.pureL2 = []
        self.L1Tags = set()
        self.L2Tags = set()
        self.commonTags = set()
        self.posMap = {}
        ## Pre-processing
        self.genPosMap()
        ## Others
        self.__utils = Utils()

    def addLangTags(self, WordTags, lTag):
        """Return a copy of WordTags with lTag appended to every token.

        Tokens are shallow-copied first, so the caller's lists are not
        modified.
        """
        wordTags = [list(wt) for wt in WordTags]
        for wt in wordTags:
            wt.append(lTag)
        return wordTags

    def makeLD(self, wordsTagsLangs):
        """Return tokens whose tag carries the language suffix: tag_lang.

        Input tokens are deep-copied; the caller's data is untouched.
        """
        newLine = []
        for index in range(len(wordsTagsLangs)):
            wordTagLang = copy.deepcopy(wordsTagsLangs[index])
            wordTagLang[1] = wordTagLang[1] + '_' + wordTagLang[2]
            newLine.append(wordTagLang)
        return newLine

    def genPosMap(self):
        """Build posMap (source tag -> universal tag) and tag inventories.

        Fix: each mapping file was previously opened and read twice —
        once for posMap and once again for the tag sets; a single pass
        per file now fills both.  Also resets L2Tags alongside L1Tags
        (previously only L1Tags was reset), so repeated calls behave
        consistently.
        """
        self.L1Tags = set()
        self.L2Tags = set()
        for line in open(self.l1MapFile):
            fields = line.strip().split()
            # fields[0] = source tag, fields[1] = universal tag
            self.posMap[fields[0]] = fields[1]
            self.L1Tags.add(fields[0])
        for line in open(self.l2MapFile):
            fields = line.strip().split()
            self.posMap[fields[0]] = fields[1]
            self.L2Tags.add(fields[0])
        self.commonTags = self.L1Tags & self.L2Tags

    def map2Uni(self, wordTagsLangs):
        """Map each token's POS tag to the universal tagset ('X' if unknown).

        NOTE(review): tokens are appended by reference, so the input
        list's elements are mutated in place — preserved, since callers
        may rely on it (contrast with mapLD2Uni, which deep-copies).
        """
        newLine = []
        for index in range(len(wordTagsLangs)):
            newLine.append(wordTagsLangs[index])
            tag = wordTagsLangs[index][1]
            try:
                newLine[index][1] = self.posMap[tag]
            except KeyError:  # was a bare except; only the dict lookup can fail
                newLine[index][1] = 'X'
        return newLine

    def mapLD2Uni(self, wordTagsLangs):
        """Map language-suffixed tags (tag_lang) to universal, keeping the suffix.

        Unknown tags become 'X_<lang>'.  Works on deep copies, so the
        input is not modified.
        """
        newLine = []
        for index in range(len(wordTagsLangs)):
            wordTagLang = copy.deepcopy(wordTagsLangs[index])
            tag = wordTagLang[1].split("_")[0]
            lang = wordTagLang[2]
            try:
                wordTagLang[1] = self.posMap[tag] + "_" + lang
            except KeyError:  # was a bare except
                wordTagLang[1] = 'X' + "_" + lang
            newLine.append(wordTagLang)
        return newLine

    def map2UniControl(self, wordTagsLangs):
        """Map tags to universal and append the language suffix (tag_lang).

        NOTE(review): like map2Uni, this mutates the input tokens in
        place — preserved behavior.
        """
        newLine = []
        for index in range(len(wordTagsLangs)):
            newLine.append(wordTagsLangs[index])
            tag = wordTagsLangs[index][1]
            lang = wordTagsLangs[index][2]
            try:
                newLine[index][1] = self.posMap[tag] + '_' + lang
            except KeyError:  # was a bare except
                newLine[index][1] = 'X' + '_' + lang
        return newLine

    def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
        """Load parallel sentences, alignments and pure sentences via Utils.

        Logs the size of each loaded resource to stderr.
        """
        self.parL1 = self.__utils.readSentences(l1Data)
        self.parL2 = self.__utils.readSentences(l2Data)
        self.align = self.__utils.readAligns(l1Aligns, l2Aligns)
        self.pureL1 = self.__utils.readSentencesPlain(pureL1Data)
        self.pureL2 = self.__utils.readSentencesPlain(pureL2Data)
        sys.stderr.write("parL1:" + str(len(self.parL1)) + "\n")
        sys.stderr.write("parL2:" + str(len(self.parL2)) + "\n")
        sys.stderr.write("align:" + str(len(self.align)) + "\n")
        sys.stderr.write("pureL1:" + str(len(self.pureL1)) + "\n")
        sys.stderr.write("pureL2:" + str(len(self.pureL2)) + "\n")
class DataGenerator:
    """Generates code-switched (CS) POS-training data files.

    Mixes "pure" monolingual sentences with synthetic code-switched
    sentences produced by CSHandler from the L1/L2 parallel corpus and
    its word alignments, writing one training file per
    (csType, split, data-size) combination into outDir.

    NOTE(review): this block was reconstructed from whitespace-mangled
    source; tokens are unchanged, only formatting and comments/docstrings
    were added.  Python 2 syntax (print statement, raw_input).
    """

    def __init__(self, outDir):
        sys.stderr.write("DataGenerator: Constructor\n")
        ## Languages and Order
        self.__LID = ["HI","EN"]
        self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
        self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
        ## Data containers
        self.__parL1 = []
        self.__parL2 = []
        self.__align = []
        self.__pureL1 = []
        self.__pureL2 = []
        self.__outputDir = outDir
        self.__posMap = {}
        self.__phraseMap = dd(list)  # phrase tag -> list of compatible tags
        self.__csInstance = CSHandler()
        self.__utils = Utils()
        self.__Tree = Dependencytree()
        ## Generation Variants
        self.__csVariants = [0,1,2,3,4]
        self.__tagsetVariants = ["",".uni"]
        self.__dataRange = range(50,900,50)
        ##self.__dataRange = [200]
        self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]  # (pure%, CS%)
        self.__csHash = set()  # dedup for generated CS sentences
        ##LID stuff
        self.__L1Tags = set()
        self.__L2Tags = set()
        self.__commonTags = set()
        ## Pre processing
        self.__genPosMap()
        self.__genPhraseMap()
        self.__csInstance.updatePhraseMap(self.__phraseMap)
        self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
        ## Real test overwrites (these deliberately override the defaults above)
        #self.__csVariants = [1,2,3,4]
        self.__tagsetVariants = [""]
        self.__dataRange = [400]
        # per-csType data sizes
        self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}
        #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]}
        #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]}
        #self.__splits = [(50,50)]
        #for i in range(0,51,5):
        #    split = (100-i, i)
        #    self.__splits.append(split)
        self.__fileSuffix = ""

    def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
        """Load parallel sentences, alignments and pure sentences via Utils.

        Logs the size of each loaded resource to stderr.
        """
        self.__parL1 = self.__utils.readSentences(l1Data)
        self.__parL2 = self.__utils.readSentences(l2Data)
        self.__align = self.__utils.readAligns(l1Aligns, l2Aligns)
        self.__pureL1 = self.__utils.readSentencesPlain(pureL1Data)
        self.__pureL2 = self.__utils.readSentencesPlain(pureL2Data)
        sys.stderr.write("parL1:"+str(len(self.__parL1))+"\n")
        sys.stderr.write("parL2:"+str(len(self.__parL2))+"\n")
        sys.stderr.write("align:"+str(len(self.__align))+"\n")
        sys.stderr.write("pureL1:"+str(len(self.__pureL1))+"\n")
        sys.stderr.write("pureL2:"+str(len(self.__pureL2))+"\n")

    def __genTrainData(self):
        """Write one training file per (dataSize, split, csType).

        Each file holds `pr` pure L1 + `pr` pure L2 sentences followed by
        `tr` code-switched sentences (deduplicated via __csHash).
        """
        statusCount = 0
        for data in self.__dataRange:
            #control = 0
            #while 1:
            for Split in self.__splits:
                #for control in range(3):
                #if control == 3:
                #    break
                #pr= int(control*1.0/2 * data)
                pr = int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                tr = data - pr
                pr = pr/2  # half pure from each language (py2 int division)
                print pr
                random.seed()
                pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                for csType in self.__csVariants:
                    self.__csHash = set()
                    ##sys.stderr.write("csType:"+str(csType)+'\n')
                    # Debugging !!
                    #switch = ""
                    #############
                    #for tag in self.__tagsetVariants:
                    # Debugging !!
                    #if switch == "yes":
                    #    break
                    ###################
                    #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
                    sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
                    ##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w')
                    ##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w')
                    ##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w')
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        #sys.stderr.write("L1 Line:"+str(line)+'\n')
                        line = self.__addLangTags(line, self.__LID[0])
                        # tokens serialized as word_#tag_#lang
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
                        ##Commented for real test
                        ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
                        lineUni = self.__map2Uni(line)
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
                        dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        #sys.stderr.write("L2 Line:"+str(line)+'\n')
                        line = self.__addLangTags(line, self.__LID[1])
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
                        ##Commented for real test
                        ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
                        lineUni = self.__map2Uni(line)
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
                        dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
                    # tr+1 compensates for the break below firing BEFORE the
                    # final accepted sentence is written.
                    stopLength = tr+1
                    index = -1
                    while 1:
                        index += 1
                        if index == len(self.__parL1):
                            index = 0  # wrap around the parallel corpus
                        csLine = ""
                        order = stopLength%2  # alternate which language leads
                        #sys.stderr.write("order:"+str(order)+'\n')
                        self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                        csReturn = self.__csInstance.csSentence(csType)
                        # Debugging !!
                        #sys.stderr.write("Switch to another CS variant?? ")
                        #switch = raw_input()
                        #if switch == "yes":
                        #    break
                        ###############
                        csLine = csReturn[0]
                        #csSequence = csReturn[1]
                        #print csReturn[1]
                        hashKey = (index, tuple(csReturn[1]))
                        #print hashKey
                        if csLine != -1 and hashKey not in self.__csHash:
                            self.__csHash.add(hashKey)
                            stopLength -= 1
                        else:
                            continue
                        #sys.stderr.write("csLine:"+str(csLine)+'\n')
                        #csLine = self.__addLangTags(csLine)
                        if stopLength <= 0:
                            break
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
                        ##Commented for real test
                        ''' dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLine)))+'\n')
                        csLineUni = self.__map2Uni(csLine)
                        dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
                        dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLineUni)))+'\n')'''
                    dataFile.close()
                    ##dataFileUni.close()
                    ##dataFileUniq.close()
                    ##dataFileUniUniq.close()
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        #pr -= 1
                        dummy = raw_input()  # pause for operator inspection
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
            #pr += 1
        print statusCount

    def __genTrainDataDiverse(self):
        """Like __genTrainData but iterates per-csType data ranges and
        dedups on the parallel-sentence index only (one CS line per pair)."""
        statusCount = 0
        for csType in self.__csVariants:
            for data in self.__dataRanges[csType]:
                for Split in self.__splits:
                    pr = int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    ##for csType in self.__csVariants:
                    self.__csHash = set()
                    sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
                    #### Dangerous ####
                    ##pIndicesL1 = []
                    ##pIndicesL2 = []
                    #### End of Dangerous ####
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
                    stopLength = tr+1
                    index = -1
                    while 1:
                        index += 1
                        if index == len(self.__parL1):
                            # single pass over the corpus: stop instead of wrapping
                            break
                            index = 0  # unreachable; left over from the wrap-around variant
                        csLine = ""
                        order = stopLength%2
                        self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                        csReturn = self.__csInstance.csSentence(csType)
                        csLine = csReturn[0]
                        hashKey = (index, tuple(csReturn[1]))
                        if csLine != -1 and hashKey not in self.__csHash:
                            #self.__csHash.add(hashKey)
                            self.__csHash.add(index)  # at most one CS line per parallel pair
                            stopLength -= 1
                        else:
                            continue
                        if stopLength <= 0:
                            break
                        dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
                    dataFile.close()
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def __genTrainDataDup(self):
        """Variant that writes both switch orders (L1->L2 and L2->L1) of each
        CS sentence, plus a parallel "_Control" file of the pure originals."""
        statusCount = 0
        for csType in self.__csVariants:
            for data in self.__dataRanges[csType]:
                for splitIndex in range(len(self.__splits)):
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr = int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    ##for csType in self.__csVariants:
                    self.__csHash = set()
                    stopLength = tr
                    index = -1
                    while 1:
                        index += 1
                        if index == len(self.__parL1):
                            break
                            index = 0  # unreachable; left over from the wrap-around variant
                        csLines = []
                        for order in range(2):  # both switch orders must succeed
                            #order = stopLength%2
                            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                            csReturn = self.__csInstance.csSentence(csType)
                            csLine = csReturn[0]
                            if csLine != -1:
                                csLines.append(csLine)
                        if len(csLines) == 2:
                            csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n')
                            csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n')
                            ##if splitIndex == 0:
                            # control data: the pure originals of the same pair
                            self.__Tree.updateTree(self.__parL1[index])
                            pureLine = self.__Tree.wordTags()
                            pureLine = self.__addLangTags(pureLine, self.__LID[0])
                            pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
                            self.__Tree.updateTree(self.__parL2[index])
                            pureLine = self.__Tree.wordTags()
                            #print pureLine
                            pureLine = self.__addLangTags(pureLine, self.__LID[1])
                            #print pureLine
                            #sys.exit(0)
                            pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
                            self.__csHash.add(index)
                            stopLength -= 1
                        else:
                            continue
                        if stopLength <= 0:
                            break
                    if stopLength > 0:
                        print tr, stopLength, "Training Break!!"
                        dummy = raw_input()
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def collectData(self):
        """Collect and return the (parL1, parL2, align) triples (and pure
        lines) gathered during the first split of csType 4.

        Returns initialSlitCSData as soon as splitIndex reaches 1; also
        writes the csType-4 files for split 0 as a side effect.
        """
        statusCount = 0
        for csType in [4]:
            for data in self.__dataRanges[csType]:
                initialSlitCSData = []
                for splitIndex in range(len(self.__splits)):
                    if splitIndex > 0:
                        # only the first split is generated; hand the data back
                        return initialSlitCSData
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr = int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                        initialSlitCSData.append((pureLine))
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                        initialSlitCSData.append((pureLine))
                    if splitIndex != 0:
                        # NOTE(review): dead branch — the early return above
                        # fires first for every splitIndex > 0.
                        random.seed()
                        csSample = random.sample(initialSlitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        ##for csType in self.__csVariants:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__parL1):
                                ##break
                                index = 0  # wrap and keep looping
                                print "Still:",stopLength," Looping.."
                            csLines = []
                            csSeqs = []
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                # accept only if the two CS orders cover exactly
                                # the words of the two pure sentences
                                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                                self.__Tree.updateTree(self.__parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                                if pureWords == csWords:
                                    p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                                    p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                                    pureData.append(p1)
                                    pureData.append(p2)
                                    cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                                    csData.append(cs1)
                                    cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                                    csData.append(cs2)
                                    if splitIndex == 0:
                                        # keep the raw triple for later re-generation
                                        initialSlitCSData.append((self.__parL1[index],self.__parL2[index], self.__align[index]))
                                    self.__csHash.add(index)
                                    stopLength -= 1
                                ##else:
                                ##    l1Switch = (0, tuple(csSeqs[0]))
                                ##    l2Switch = (1, tuple(csSeqs[1]))
                                ##    self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
                            else:
                                continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Training Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def __genFromSingleData(self):
        """Re-generate all csType/split files from a single dataset returned
        by collectData(), so every variant is built from the same sentences.

        NOTE(review): `tr` is not defined in this method — the
        "Training Break!!" branch would raise NameError if reached.
        """
        dataset = self.collectData()
        CSData = [d for d in dataset if len(d)==3]    # (parL1, parL2, align) triples
        PUREData = [d[1] for d in dataset if len(d)==1]
        pureFile = open(self.__outputDir+"Baseline"+self.__fileSuffix,'w')
        pureFlag = 1  # write the baseline (pure) file only once
        for csType in self.__csVariants:
            for splitIndex in range(len(self.__splits)):
                Split = self.__splits[splitIndex]
                dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(len(dataset))+self.__fileSuffix,'w')
                stopLength = len(CSData)/2
                csData = []
                pureData = []
                index = -1
                while 1:
                    index += 1
                    # NOTE(review): bound uses __parL1, not CSData — assumed
                    # equal in length; verify against collectData's output.
                    if index == len(self.__parL1):
                        ##break
                        index = 0
                        print "Still:",stopLength," Looping.."
                    csLines = []
                    csSeqs = []
                    for order in range(2):
                        #order = stopLength%2
                        self.__csInstance.updateHandler(CSData[index][0], CSData[index][1], CSData[index][2], order)
                        csReturn = self.__csInstance.csSentence(csType)
                        csLine = csReturn[0]
                        if csLine != -1:
                            csLines.append(csLine)
                            csSeqs.append(csReturn[1])
                    if len(csLines) == 2:
                        csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                        self.__Tree.updateTree(self.__parL1[index])
                        pureLine1 = self.__Tree.wordTags()
                        pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                        self.__Tree.updateTree(self.__parL2[index])
                        pureLine2 = self.__Tree.wordTags()
                        pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                        pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                        if pureWords == csWords:
                            p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                            p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                            pureData.append(p1)
                            pureData.append(p2)
                            cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                            csData.append(cs1)
                            cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                            csData.append(cs2)
                            self.__csHash.add(index)
                            stopLength -= 1
                    else:
                        continue
                    if stopLength <= 0:
                        break
                if stopLength > 0:
                    print tr, stopLength, "Training Break!!"
                    dummy = raw_input()
                for csLine in csData:
                    dataFile.write(csLine)
                if pureFlag:
                    pureFlag = 0
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    for pureLine in PUREData:
                        pureFile.write(pureLine)
                # close() on an already-closed file is a no-op in later passes
                pureFile.close()
                dataFile.close()

    def __genTrainDataDupStrict(self):
        """Like __genTrainDataDup, but a CS pair is accepted only when the
        words of both CS orders exactly match the pure originals; split 0's
        accepted lines are reused verbatim for the other splits."""
        statusCount = 0
        for csType in self.__csVariants:
            for data in self.__dataRanges[csType]:
                initialSlitCSData = []
                for splitIndex in range(len(self.__splits)):
                    csData = []
                    Split = self.__splits[splitIndex]
                    pureData = []
                    pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
                    dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
                    pr = int((Split[0]*1.0/(Split[0]+Split[1])) * data)
                    tr = data - pr
                    #pr = pr/2
                    print pr
                    random.seed()
                    pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
                    pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
                    for index in pIndicesL1:
                        line = self.__pureL1[index]
                        line = self.__addLangTags(line, self.__LID[0])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    for index in pIndicesL2:
                        line = self.__pureL2[index]
                        line = self.__addLangTags(line, self.__LID[1])
                        pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
                        ##pureFile.write(pureLine)
                        ##if splitIndex == 0:
                        pureData.append(pureLine)
                        csData.append(pureLine)
                    if splitIndex != 0:
                        # reuse CS/pure lines collected during split 0
                        random.seed()
                        csSample = random.sample(initialSlitCSData, tr)
                        for sample in csSample:
                            csData.append(sample[0])
                            csData.append(sample[1])
                            pureData.append(sample[2])
                            pureData.append(sample[3])
                    else:
                        ##for csType in self.__csVariants:
                        self.__csHash = set()
                        stopLength = tr
                        index = -1
                        while 1:
                            index += 1
                            if index == len(self.__parL1):
                                ##break
                                index = 0  # wrap and keep looping
                                print "Still:",stopLength," Looping.."
                            csLines = []
                            csSeqs = []
                            for order in range(2):
                                #order = stopLength%2
                                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                                csReturn = self.__csInstance.csSentence(csType)
                                csLine = csReturn[0]
                                if csLine != -1:
                                    csLines.append(csLine)
                                    csSeqs.append(csReturn[1])
                            if len(csLines) == 2:
                                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                                self.__Tree.updateTree(self.__parL1[index])
                                pureLine1 = self.__Tree.wordTags()
                                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                                self.__Tree.updateTree(self.__parL2[index])
                                pureLine2 = self.__Tree.wordTags()
                                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                                if pureWords == csWords:
                                    p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                                    p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                                    pureData.append(p1)
                                    pureData.append(p2)
                                    cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                                    csData.append(cs1)
                                    cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                                    csData.append(cs2)
                                    if splitIndex == 0:
                                        initialSlitCSData.append((cs1,cs2, p1, p2))
                                    self.__csHash.add(index)
                                    stopLength -= 1
                                ##else:
                                ##    l1Switch = (0, tuple(csSeqs[0]))
                                ##    l2Switch = (1, tuple(csSeqs[1]))
                                ##    self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
                            else:
                                continue
                            if stopLength <= 0:
                                break
                        if stopLength > 0:
                            print tr, stopLength, "Training Break!!"
                            dummy = raw_input()
                    for csLine in csData:
                        dataFile.write(csLine)
                    ##if splitIndex == 0:
                    for pureLine in pureData:
                        pureFile.write(pureLine)
                    pureFile.close()
                    dataFile.close()
                    statusCount += 1
                    if statusCount%50 == 0:
                        print statusCount,
                        sys.stdout.flush()
        print statusCount

    def __addLangTags(self, WordTags, lTag):
        """Return a copy of WordTags with lTag appended to each token."""
        wordTags = []
        for wt in WordTags:
            newWT = [i for i in wt]
            wordTags.append(newWT)
        for index in range(len(wordTags)):
            wordTags[index].append(lTag)
        return wordTags

    def __genPosMap(self):
        """Build __posMap (source tag -> universal tag) and tag inventories.

        NOTE(review): each mapping file is read twice (once for the map,
        once for the tag sets) — redundant but harmless.
        """
        for i in open(self.__l1MapFile):
            i = i.strip()
            srcTag = i.split()[0]
            uniTag = i.split()[1]
            self.__posMap[srcTag] = uniTag
        for i in open(self.__l2MapFile):
            i = i.strip()
            srcTag = i.split()[0]
            uniTag = i.split()[1]
            self.__posMap[srcTag] = uniTag
        self.__L1Tags = set()
        for line in open(self.__l1MapFile):
            tag = line.split()[0]
            self.__L1Tags.add(tag)
        for line in open(self.__l2MapFile):
            tag = line.split()[0]
            self.__L2Tags.add(tag)
        self.__commonTags = set([c for c in self.__L1Tags if c in self.__L2Tags])

    def __map2Uni(self, wordTagsLangs):
        """Map each token's POS tag to the universal tagset ('X' if unknown).

        NOTE(review): tokens are appended by reference, so the input's
        elements are mutated in place.
        """
        newLine = []
        for index in range(len(wordTagsLangs)):
            newLine.append(wordTagsLangs[index])
            tag = wordTagsLangs[index][1]
            try:
                newLine[index][1] = self.__posMap[tag]
            except:
                newLine[index][1] = 'X'
        return newLine

    def __map2UniControl(self, wordTagsLangs):
        """Map tags to universal and append the language suffix (tag_lang).

        NOTE(review): mutates the input tokens in place, like __map2Uni.
        """
        newLine = []
        for index in range(len(wordTagsLangs)):
            newLine.append(wordTagsLangs[index])
            tag = wordTagsLangs[index][1]
            lang = wordTagsLangs[index][2]
            try:
                newLine[index][1] = self.__posMap[tag]+'_'+lang
            except:
                newLine[index][1] = 'X'+'_'+lang
        return newLine

    def __genPhraseMap(self):
        """Load the phrase-tag compatibility map (tag -> comma-separated tags)."""
        phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping")
        for i in phraseMapFile:
            i = i.strip()
            self.__phraseMap[i.split()[0]].extend(i.split()[1].split(","))

    def generateData(self):
        """Entry point: run the currently selected generation strategy."""
        ##for i in range(10):
        ##self.__fileSuffix = "."+str(i)
        #self.__genTrainDataDiverse()
        self.__genTrainDataDupStrict()