def extractLexRec(srcFile, saveFile, RecordClass = record.MosesRecord):
    '''Write lexical translation probabilities taken from one-to-one rules.

    Every line of srcFile is parsed as a record.TravatarRecord.  Records
    having exactly one source symbol and one target symbol add their
    co-occurrence count (rec.counts.co) to the totals of the source symbol,
    the target symbol and the (source, target) pair.  Afterwards one line
    "src trg egfl fgel" is written per pair, in sorted pair order, where
    egfl = count(src, trg) / count(src) and fgel = count(src, trg) / count(trg).

    srcFile  -- path (opened with files.open) or an iterable of lines
    saveFile -- path (opened with files.open in 'w' mode) or a writable
                file object; it is closed before returning

    NOTE(review): RecordClass is accepted but never used -- lines are always
    parsed with record.TravatarRecord.  Confirm whether that is intended.
    '''
    if type(srcFile) is str:
        srcFile = files.open(srcFile)
    if type(saveFile) is str:
        saveFile = files.open(saveFile, 'w')
    srcTotals = defaultdict(int)
    trgTotals = defaultdict(int)
    pairTotals = defaultdict(int)
    for line in srcFile:
        rec = record.TravatarRecord(line)
        sources = rec.srcSymbols
        targets = rec.trgSymbols
        # only single-symbol rules on both sides are counted
        if len(sources) != 1 or len(targets) != 1:
            continue
        src, trg = sources[0], targets[0]
        srcTotals[src] += rec.counts.co
        trgTotals[trg] += rec.counts.co
        pairTotals[(src, trg)] += rec.counts.co
    for (src, trg) in sorted(pairTotals):
        joint = pairTotals[(src, trg)]
        egfl = joint / float(srcTotals[src])
        fgel = joint / float(trgTotals[trg])
        saveFile.write("%s %s %s %s\n" % (src, trg, egfl, fgel))
    saveFile.close()
def calcWordPairCountsByAligns(srcTextPath, trgTextPath, alignPath):
    '''Count aligned word pairs over a sentence-aligned corpus.

    The three files are read in lockstep, one sentence per line, until the
    source text is exhausted.  Source and target lines are split on single
    spaces; each alignment line holds whitespace-separated "i-j" links, where
    i indexes the source words and j the target words.  Every link adds the
    pair (srcWords[i], trgWords[j]) to the counter.  Words taking part in no
    link are paired with the literal token "NULL".

    A line without any alignment link (empty or whitespace only) is valid:
    all of its words are then counted against "NULL".

    Returns the populated PairCounter.  All three files are closed before
    returning, also when an error is raised.
    '''
    pairCounter = PairCounter()
    opened = []
    try:
        for path in (srcTextPath, trgTextPath, alignPath):
            opened.append( files.open(path, 'r') )
        (srcTextFile, trgTextFile, alignFile) = opened
        while True:
            srcLine = srcTextFile.readline()
            trgLine = trgTextFile.readline()
            alignLine = alignFile.readline()
            if srcLine == "":
                break
            srcWords = srcLine.strip().split(' ')
            trgWords = trgLine.strip().split(' ')
            # split() without an argument yields no empty tokens, so a line
            # without links gives an empty list instead of failing on int('')
            alignList = alignLine.split()
            srcAlignedIndices = set()
            trgAlignedIndices = set()
            for align in alignList:
                (srcIndex, trgIndex) = map(int, align.split('-'))
                pairCounter.addPair(srcWords[srcIndex], trgWords[trgIndex])
                srcAlignedIndices.add( srcIndex )
                trgAlignedIndices.add( trgIndex )
            for i, srcWord in enumerate(srcWords):
                if i not in srcAlignedIndices:
                    pairCounter.addPair(srcWord, "NULL")
            for i, trgWord in enumerate(trgWords):
                if i not in trgAlignedIndices:
                    pairCounter.addPair("NULL", trgWord)
    finally:
        for openedFile in opened:
            openedFile.close()
    return pairCounter
def __init__(self, table1, table2, index1, index2, RecordClass = MosesRecord):
    '''Open two tables together with their indices.

    table1, table2 -- paths of the tables, opened for reading via files.open
    index1, index2 -- paths handed to findutil.loadIndices
    RecordClass    -- class stored as self.Record (presumably used to parse
                      table lines elsewhere in this class -- confirm)
    '''
    # plain state that needs no I/O
    self.Record = RecordClass
    self.rows = []
    # table files and their indices
    self.srcFile = files.open(table1, 'r')
    self.trgFile = files.open(table2, 'r')
    self.srcIndices = findutil.loadIndices(index1)
    self.trgIndices = findutil.loadIndices(index2)
    # progress counter and row cache
    self.srcCount = progress.Counter(scaleup=1000)
    self.rowsCache = cache.Cache(size=CACHESIZE)
def calcLexWeights(tablePath, lexCounts, savePath, RecordClass = MosesRecord):
    '''Add lexical weight features to every record of a table.

    Each line of the table at tablePath is parsed with RecordClass, the two
    lexical weights are computed by calcLexWeight (reverse=False, then
    reverse=True) and the record is written to savePath.  Records whose
    target side contains '|COL|' store the weights under '0egfl' / '0fgel';
    all other records use 'egfl' / 'fgel'.
    '''
    tableFile = files.open(tablePath, 'r')
    saveFile = files.open(savePath, 'w')
    for line in tableFile:
        rec = RecordClass(line)
        hasColumnMark = not (rec.trg.find('|COL|') < 0)
        if hasColumnMark:
            forwardKey, backwardKey = '0egfl', '0fgel'
        else:
            forwardKey, backwardKey = 'egfl', 'fgel'
        rec.features[forwardKey] = calcLexWeight(rec, lexCounts, reverse=False)
        rec.features[backwardKey] = calcLexWeight(rec, lexCounts, reverse=True)
        saveFile.write(rec.toStr())
    saveFile.close()
    tableFile.close()
def reverseTable(srcFile, saveFile, RecordClass = record.MosesRecord):
    '''Write every record of srcFile in reversed form to saveFile, sorted.

    Each input line is parsed with RecordClass and the text of
    rec.getReversed().toStr() is piped into the external `sort` command,
    run with LC_ALL=C, whose output goes to saveFile.  When the module-level
    PV is set it is interpolated as a command into a shell pipeline on both
    sides of sort (presumably the `pv` progress viewer -- confirm).

    srcFile  -- path (opened with files.open) or an iterable of lines
    saveFile -- path or writable file object.  A path whose extension is
                '.gz' is compressed by piping through the external `gzip`
                command.  saveFile is closed before returning.
    '''
    if isinstance(srcFile, str):
        srcFile = files.open(srcFile)
    rawFile = None     # plain file opened here when saveFile is a path
    pipeGzip = None    # gzip child process when the output is compressed
    if isinstance(saveFile, str):
        compress = (files.getExt(saveFile) == '.gz')
        rawFile = open(saveFile, 'w')
        if compress:
            pipeGzip = subprocess.Popen(['gzip'], stdin=subprocess.PIPE, stdout=rawFile)
            saveFile = pipeGzip.stdin
        else:
            saveFile = rawFile
    gc.collect()
    env = os.environ.copy()
    env['LC_ALL'] = 'C'
    if PV:
        cmd = '%s -Wl -N "loaded lines" | sort | %s -Wl -N "sorted lines"' % (PV, PV)
        pipeSort = subprocess.Popen(cmd, env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True, shell=True)
    else:
        pipeSort = subprocess.Popen(['sort'], env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True)
    inputSort = pipeSort.stdin
    for line in srcFile:
        rec = RecordClass(line)
        inputSort.write( rec.getReversed().toStr() )
    # closing stdin signals end of input; sort emits its output only then
    pipeSort.stdin.close()
    pipeSort.communicate()
    saveFile.close()
    if pipeGzip is not None:
        # gzip finishes once its stdin is closed; wait for it so the
        # compressed file is complete on return, then release the file
        pipeGzip.wait()
        rawFile.close()
def filterTable(srcFile, saveFile, rules, progress, RecordClass = record.MosesRecord):
    '''Copy the records of srcFile that satisfy rules to saveFile.

    Each input line is parsed with RecordClass and written (rec.toStr())
    when matchRules(rec, rules) is true.

    srcFile  -- path (opened with files.open) or an iterable of lines
    saveFile -- path or writable file object.  A path whose extension is
                '.gz' is compressed by piping through the external `gzip`
                command.  saveFile is closed before returning.
    rules    -- passed unchanged to matchRules
    progress -- when true and the module-level PV is set, the output is
                routed through the PV command (presumably the `pv` progress
                viewer -- confirm)
    '''
    if isinstance(srcFile, str):
        srcFile = files.open(srcFile)
    rawFile = None     # plain file opened here when saveFile is a path
    pipeGzip = None    # gzip child process when the output is compressed
    if isinstance(saveFile, str):
        compress = (files.getExt(saveFile) == '.gz')
        rawFile = open(saveFile, 'w')
        if compress:
            pipeGzip = subprocess.Popen(['gzip'], stdin=subprocess.PIPE, stdout=rawFile)
            saveFile = pipeGzip.stdin
        else:
            saveFile = rawFile
    pipePV = None
    if progress and PV:
        # the environment for the child was previously an undefined name;
        # build it here from the current process environment
        import os
        env = os.environ.copy()
        env['LC_ALL'] = 'C'
        cmd = '%s -Wl -N "filtered lines"' % (PV)
        pipePV = subprocess.Popen(cmd, env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True, shell=True)
        output = pipePV.stdin
    else:
        output = saveFile
    for line in srcFile:
        rec = RecordClass(line)
        if matchRules(rec, rules):
            output.write( rec.toStr() )
    if pipePV is not None:
        pipePV.stdin.close()
        pipePV.communicate()
    saveFile.close()
    if pipeGzip is not None:
        # gzip finishes once its stdin is closed; wait for it so the
        # compressed file is complete on return, then release the file
        pipeGzip.wait()
        rawFile.close()
def __init__(self, tablePath, RecordClass, **options):
    '''Open the table at tablePath and load it.

    tablePath   -- path of the table, opened for reading via files.open
    RecordClass -- class stored as self.RecordClass
    options     -- 'showProgress' (default False) is forwarded to the
                   private loader; other options are not read here
    '''
    self.RecordClass = RecordClass
    self.tablePath = tablePath
    self.tableFile = files.open(tablePath, 'r')
    # two initially empty mappings, presumably filled by the loader -- confirm
    self.recordsSrcTrg = dict()
    self.recordsTrgSrc = dict()
    self.__load(options.get('showProgress', False))
def writeRecordQueue(workset):
    '''write the pivoted records in the queue into the table file

    Records are taken from workset.outQueue until None is received, which
    marks the end of the stream.  Only records with a positive
    co-occurrence count (rec.counts.co) are written to the file at
    workset.pivotPath.  The file is closed even when an error is raised.
    '''
    pivotFile = files.open(workset.pivotPath, 'w')
    try:
        while True:
            rec = workset.outQueue.get()
            # identity test: the end marker is the None object itself, and
            # a record's own equality operator must not be consulted
            if rec is None:
                break
            if rec.counts.co > 0:
                pivotFile.write( rec.toStr() )
    finally:
        pivotFile.close()
def loadWordPairCounts(lexPath):
    '''Load word pair counts from a lexical count file.

    Each non-blank line is split on whitespace; the first two fields are the
    source and target word and the third is the pair count, converted with
    number.toNumber.  Fields after the third are ignored.  Words are interned
    so that repeated words share one string object.

    Returns a PairCounter holding the loaded pairs.  The file is closed
    before returning, also when an error is raised.
    '''
    import sys
    try:
        internWord = sys.intern    # Python 3
    except AttributeError:
        internWord = intern        # Python 2 builtin
    pairCounter = PairCounter()
    lexFile = files.open(lexPath, 'r')
    try:
        for line in lexFile:
            fields = line.split()
            if not fields:
                # tolerate blank lines, e.g. a trailing newline at end of file
                continue
            srcWord = internWord( fields[0] )
            trgWord = internWord( fields[1] )
            pairCounter.addPair(srcWord, trgWord, number.toNumber(fields[2]))
    finally:
        lexFile.close()
    return pairCounter
def saveWordPairCounts(savePath, pairCounter):
    '''Write the counts held by pairCounter to savePath.

    Pairs are visited in sorted order.  For each pair whose converted pair
    count is positive, one line
    "srcWord trgWord pairCount srcCount trgCount" is written.  All counts
    are passed through number.toNumber together with the module-level
    MARGIN (presumably a rounding tolerance -- confirm).
    '''
    outFile = files.open(savePath, 'w')
    pairCounts = pairCounter.pairCounts
    for wordPair in sorted(pairCounts.keys()):
        srcWord, trgWord = wordPair[0], wordPair[1]
        nSrc = number.toNumber(pairCounter.srcCounts[srcWord], MARGIN)
        nTrg = number.toNumber(pairCounter.trgCounts[trgWord], MARGIN)
        nPair = number.toNumber(pairCounts[wordPair], MARGIN)
        if not nPair > 0:
            # pairs without a positive count are left out of the file
            continue
        outFile.write("%s %s %s %s %s\n" % (srcWord, trgWord, nPair, nSrc, nTrg))
    outFile.close()
def loadWordProbs(srcFile, reverse = False):
    '''Load word translation probabilities into a dictionary.

    Every non-blank line is split on whitespace into at least four fields:
    source word, target word and two probabilities.

    srcFile -- path (opened with files.open and closed again here) or an
               iterable of lines, which is left open for the caller
    reverse -- when false, map (src, trg) to the third field;
               when true, map (trg, src) to the fourth field

    Returns a dict from word pair tuples to float probabilities.
    Raises IndexError / ValueError for lines with too few fields or with
    non-numeric probabilities.
    '''
    openedHere = isinstance(srcFile, str)
    if openedHere:
        srcFile = files.open(srcFile)
    probs = {}
    try:
        for line in srcFile:
            fields = line.split()
            if not fields:
                # tolerate blank lines, e.g. a trailing newline at end of file
                continue
            src = fields[0]
            trg = fields[1]
            if not reverse:
                probs[(src, trg)] = float(fields[2])
            else:
                probs[(trg, src)] = float(fields[3])
    finally:
        # only close what was opened here; a caller's file stays usable
        if openedHere:
            srcFile.close()
    return probs
def calcPhraseTransProbsOnTable(tablePath, savePath, **options):
    '''calculate phrase trans probs on the table in which co-occurrence counts are estimated

    The table is read line by line and records are grouped by consecutive
    equal source side (rec.src), so the input is expected to be ordered by
    source.  Records whose co-occurrence count (rec.counts.co) is not
    positive are dropped.  Each completed group is handed to
    calcPhraseTransProbsByCounts and then written with writeRecords.

    options -- 'RecordClass' (default MosesRecord) selects the line parser;
               other options, including 'method', are not used here
    Both files are closed before returning, also when an error is raised.
    '''
    RecordClass = options.get('RecordClass', MosesRecord)

    def flush(group):
        # finish one source group: compute its probabilities, then write it
        calcPhraseTransProbsByCounts(group)
        writeRecords(saveFile, group)

    tableFile = files.open(tablePath, "r")
    try:
        saveFile = files.open(savePath, "w")
        try:
            records = {}
            lastSrc = ''
            for line in tableFile:
                rec = RecordClass(line)
                if rec.src != lastSrc and records:
                    # the source side changed, so the previous group is complete
                    flush(records)
                    records = {}
                if rec.counts.co > 0:
                    key = "%s ||| %s |||" % (rec.src, rec.trg)
                    records[key] = rec
                lastSrc = rec.src
            if records:
                flush(records)
        finally:
            saveFile.close()
    finally:
        tableFile.close()
def reverseTable(srcFile, saveFile, RecordClass=record.MosesRecord):
    '''Write every record of srcFile in reversed form to saveFile, sorted.

    Each input line is parsed with RecordClass and the text of
    rec.getReversed().toStr() is piped into the external `sort` command,
    run with LC_ALL=C, whose output goes to saveFile.  When the module-level
    PV is set it is interpolated as a command into a shell pipeline on both
    sides of sort (presumably the `pv` progress viewer -- confirm).

    srcFile  -- path (opened with files.open) or an iterable of lines
    saveFile -- path or writable file object.  A path whose extension is
                '.gz' is compressed by piping through the external `gzip`
                command.  saveFile is closed before returning.
    '''
    if isinstance(srcFile, str):
        srcFile = files.open(srcFile)
    rawFile = None     # plain file opened here when saveFile is a path
    pipeGzip = None    # gzip child process when the output is compressed
    if isinstance(saveFile, str):
        compress = (files.getExt(saveFile) == '.gz')
        rawFile = open(saveFile, 'w')
        if compress:
            pipeGzip = subprocess.Popen(
                ['gzip'], stdin=subprocess.PIPE, stdout=rawFile)
            saveFile = pipeGzip.stdin
        else:
            saveFile = rawFile
    gc.collect()
    env = os.environ.copy()
    env['LC_ALL'] = 'C'
    if PV:
        cmd = '%s -Wl -N "loaded lines" | sort | %s -Wl -N "sorted lines"' % (
            PV, PV)
        pipeSort = subprocess.Popen(
            cmd, env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True, shell=True)
    else:
        pipeSort = subprocess.Popen(
            ['sort'], env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True)
    inputSort = pipeSort.stdin
    for line in srcFile:
        rec = RecordClass(line)
        inputSort.write(rec.getReversed().toStr())
    # closing stdin signals end of input; sort emits its output only then
    pipeSort.stdin.close()
    pipeSort.communicate()
    saveFile.close()
    if pipeGzip is not None:
        # gzip finishes once its stdin is closed; wait for it so the
        # compressed file is complete on return, then release the file
        pipeGzip.wait()
        rawFile.close()
def filterTable(srcFile, saveFile, rules, progress, RecordClass=record.MosesRecord):
    '''Copy the records of srcFile that satisfy rules to saveFile.

    Each input line is parsed with RecordClass and written (rec.toStr())
    when matchRules(rec, rules) is true.

    srcFile  -- path (opened with files.open) or an iterable of lines
    saveFile -- path or writable file object.  A path whose extension is
                '.gz' is compressed by piping through the external `gzip`
                command.  saveFile is closed before returning.
    rules    -- passed unchanged to matchRules
    progress -- when true and the module-level PV is set, the output is
                routed through the PV command (presumably the `pv` progress
                viewer -- confirm)
    '''
    if isinstance(srcFile, str):
        srcFile = files.open(srcFile)
    rawFile = None     # plain file opened here when saveFile is a path
    pipeGzip = None    # gzip child process when the output is compressed
    if isinstance(saveFile, str):
        compress = (files.getExt(saveFile) == '.gz')
        rawFile = open(saveFile, 'w')
        if compress:
            pipeGzip = subprocess.Popen(
                ['gzip'], stdin=subprocess.PIPE, stdout=rawFile)
            saveFile = pipeGzip.stdin
        else:
            saveFile = rawFile
    pipePV = None
    if progress and PV:
        # the environment for the child was previously an undefined name;
        # build it here from the current process environment
        import os
        env = os.environ.copy()
        env['LC_ALL'] = 'C'
        cmd = '%s -Wl -N "filtered lines"' % (PV)
        pipePV = subprocess.Popen(
            cmd, env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True, shell=True)
        output = pipePV.stdin
    else:
        output = saveFile
    for line in srcFile:
        rec = RecordClass(line)
        if matchRules(rec, rules):
            output.write(rec.toStr())
    if pipePV is not None:
        pipePV.stdin.close()
        pipePV.communicate()
    saveFile.close()
    if pipeGzip is not None:
        # gzip finishes once its stdin is closed; wait for it so the
        # compressed file is complete on return, then release the file
        pipeGzip.wait()
        rawFile.close()
def integrateTablePair(tablePath1, tablePath2, savePath, **options):
    '''Merge two tables into one, combining groups that share a source side.

    Both tables are read through RecordReader, which yields one group of
    records per getRecords() call; an empty group marks the end of a table.
    The groups are walked in parallel like a merge of two sorted streams:
    the group with the smaller source key is written as is, and groups with
    equal keys are combined by mergeRecords before being written.

    options are forwarded to both RecordReader instances and to mergeRecords.
    The output file is closed before returning, also when an error is raised.
    '''
    recReader1 = RecordReader(tablePath1, **options)
    recReader2 = RecordReader(tablePath2, **options)
    saveFile = files.open(savePath, 'w')
    try:
        records1 = recReader1.getRecords()
        records2 = recReader2.getRecords()
        while True:
            if len(records1) == 0 and len(records2) == 0:
                break
            elif len(records1) == 0:
                # table 1 is exhausted: drain table 2
                triangulate.writeRecords(saveFile, records2)
                records2 = recReader2.getRecords()
                continue
            elif len(records2) == 0:
                # table 2 is exhausted: drain table 1
                triangulate.writeRecords(saveFile, records1)
                records1 = recReader1.getRecords()
                continue
            # the field delimiter is appended before comparing, presumably so
            # that the order matches the sort order of the table lines -- confirm
            key1 = records1[0].src + ' |||'
            key2 = records2[0].src + ' |||'
            if key1 < key2:
                triangulate.writeRecords(saveFile, records1)
                records1 = recReader1.getRecords()
            elif key1 > key2:
                triangulate.writeRecords(saveFile, records2)
                records2 = recReader2.getRecords()
            else:
                # same source side in both tables: merge the two groups
                merged = mergeRecords(records1, records2, **options)
                triangulate.writeRecords(saveFile, merged)
                records1 = recReader1.getRecords()
                records2 = recReader2.getRecords()
    finally:
        saveFile.close()
def __init__(self, tablePath, **options):
    '''Open the table at tablePath for reading.

    options -- 'RecordClass' (default MosesRecord) is stored as
               self.RecordClass; other options are not read here
    '''
    # starts out with no records
    self.records = list()
    recordClass = options.get('RecordClass', MosesRecord)
    self.RecordClass = recordClass
    self.tableFile = files.open(tablePath, 'r')