コード例 #1
0
ファイル: lex.py プロジェクト: akivajp/explib-python
def extractLexRec(srcFile, saveFile, RecordClass = record.MosesRecord):
    '''Extract word-level lexical probabilities from single-symbol records.

    Every line of srcFile is parsed as a record.  Records whose source side
    and target side each consist of exactly one symbol add their
    co-occurrence count (rec.counts.co) to the totals of that source symbol,
    that target symbol and that (source, target) pair.  For each pair, in
    sorted order, one line "src trg egfl fgel" is written, where
        egfl = count(src, trg) / count(src)
        fgel = count(src, trg) / count(trg)

    srcFile     -- path (str) or an iterable of lines
    saveFile    -- path (str) or a writable object; it is closed on return
    RecordClass -- accepted for interface compatibility
    '''
    # NOTE(review): lines are always parsed with record.TravatarRecord and
    # RecordClass is ignored.  Kept as is because switching parsers would
    # change behaviour for callers relying on the default -- confirm intent.
    openedHere = isinstance(srcFile, str)
    if openedHere:
        srcFile = files.open(srcFile)
    if isinstance(saveFile, str):
        saveFile = files.open(saveFile, 'w')
    try:
        srcCount = defaultdict(int)
        trgCount = defaultdict(int)
        coCount  = defaultdict(int)
        try:
            for line in srcFile:
                rec = record.TravatarRecord(line)
                srcSymbols = rec.srcSymbols
                trgSymbols = rec.trgSymbols
                if len(srcSymbols) == 1 and len(trgSymbols) == 1:
                    src = srcSymbols[0]
                    trg = trgSymbols[0]
                    srcCount[src] += rec.counts.co
                    trgCount[trg] += rec.counts.co
                    coCount[(src,trg)] += rec.counts.co
        finally:
            # only close the source when this function opened it
            if openedHere:
                srcFile.close()
        for pair in sorted(coCount):
            (src,trg) = pair
            egfl = coCount[pair] / float(srcCount[src])
            fgel = coCount[pair] / float(trgCount[trg])
            saveFile.write("%s %s %s %s\n" % (src, trg, egfl, fgel))
    finally:
        # the output is closed even when parsing or writing fails
        saveFile.close()
コード例 #2
0
ファイル: lex.py プロジェクト: akivajp/explib-python
def calcWordPairCountsByAligns(srcTextPath, trgTextPath, alignPath):
    '''Count aligned word pairs over a sentence-aligned corpus.

    The three files are read in lockstep, one line each per sentence, until
    the source text is exhausted.  An alignment line is a whitespace
    separated list of "i-j" tokens, i indexing the source words and j the
    target words of the same line.  Each aligned pair is registered with
    PairCounter.addPair; every word that takes part in no alignment is
    paired with the literal "NULL" on the opposite side.

    Returns the filled PairCounter.
    '''
    pairCounter = PairCounter()
    openedFiles = []
    try:
        for path in (srcTextPath, trgTextPath, alignPath):
            openedFiles.append(files.open(path, 'r'))
        srcTextFile, trgTextFile, alignFile = openedFiles
        while True:
            srcLine = srcTextFile.readline()
            trgLine = trgTextFile.readline()
            alignLine = alignFile.readline()
            if srcLine == "":
                break
            srcWords = srcLine.strip().split(' ')
            trgWords = trgLine.strip().split(' ')
            # split() without an argument yields [] for a sentence that has
            # no alignment points; split(' ') yielded [''] and crashed in
            # int('') below
            alignList = alignLine.split()
            srcAlignedIndices = set()
            trgAlignedIndices = set()
            for align in alignList:
                srcIndex, trgIndex = [int(index) for index in align.split('-')]
                pairCounter.addPair(srcWords[srcIndex], trgWords[trgIndex])
                srcAlignedIndices.add(srcIndex)
                trgAlignedIndices.add(trgIndex)
            for i, srcWord in enumerate(srcWords):
                if i not in srcAlignedIndices:
                    pairCounter.addPair(srcWord, "NULL")
            for i, trgWord in enumerate(trgWords):
                if i not in trgAlignedIndices:
                    pairCounter.addPair("NULL", trgWord)
    finally:
        # release every handle that was successfully opened
        for openedFile in openedFiles:
            openedFile.close()
    return pairCounter
コード例 #3
0
ファイル: triangulate.py プロジェクト: akivajp/acl2015
 def __init__(self, table1, table2, index1, index2, RecordClass = MosesRecord):
     '''Open both tables, load both indices and start with empty working state.

     table1, table2 -- paths opened for reading via files.open
     index1, index2 -- handed to findutil.loadIndices
     RecordClass    -- stored as self.Record
     '''
     # plain state that does not depend on any external resource
     self.Record = RecordClass
     self.rows = []
     # resources and helpers, acquired in the original order
     self.srcFile = files.open(table1, 'r')
     self.trgFile = files.open(table2, 'r')
     self.srcIndices = findutil.loadIndices(index1)
     self.trgIndices = findutil.loadIndices(index2)
     self.srcCount = progress.Counter(scaleup = 1000)
     self.rowsCache = cache.Cache(size = CACHESIZE)
コード例 #4
0
ファイル: triangulate.py プロジェクト: akivajp/acl2015
def calcLexWeights(tablePath, lexCounts, savePath, RecordClass = MosesRecord):
    '''Attach lexical weight features to every record of a table.

    Each line of tablePath is parsed with RecordClass, given two features
    computed by calcLexWeight (reverse = False and reverse = True) and
    written to savePath via rec.toStr().  The features are named
    'egfl'/'fgel', or '0egfl'/'0fgel' when the target side contains '|COL|'.
    '''
    tableFile = files.open(tablePath, 'r')
    saveFile  = files.open(savePath, 'w')
    for line in tableFile:
        rec = RecordClass(line)
        # records whose target contains '|COL|' get '0'-prefixed feature names
        if rec.trg.find('|COL|') < 0:
            prefix = ''
        else:
            prefix = '0'
        for featureName, isReverse in (('egfl', False), ('fgel', True)):
            rec.features[prefix + featureName] = calcLexWeight(rec, lexCounts, reverse = isReverse)
        saveFile.write( rec.toStr() )
    saveFile.close()
    tableFile.close()
コード例 #5
0
ファイル: reverse.py プロジェクト: akivajp/acl2015
def reverseTable(srcFile, saveFile, RecordClass = record.MosesRecord):
    '''Write the reversed form of every record of a table, sorted.

    Each input line is parsed with RecordClass, reversed with getReversed()
    and fed to an external `sort` process (run with LC_ALL=C) whose output
    goes to saveFile.

    srcFile  -- path (str) or an iterable of lines
    saveFile -- path (str) or a file object usable as a subprocess stdout;
                a path ending in '.gz' is compressed through `gzip`.
                It is closed on return.
    '''
    if isinstance(srcFile, str):
        srcFile = files.open(srcFile)
    pipeGzip = None
    rawFile = None
    if isinstance(saveFile, str):
        if files.getExt(saveFile) == '.gz':
            # sort writes into gzip's stdin, gzip writes into the real file
            rawFile = open(saveFile, 'w')
            pipeGzip = subprocess.Popen(['gzip'], stdin=subprocess.PIPE, stdout=rawFile)
            saveFile = pipeGzip.stdin
        else:
            saveFile = open(saveFile, 'w')
    gc.collect()
    env = os.environ.copy()
    env['LC_ALL'] = 'C'
    if PV:
        # PV is a module-level name; presumably the path of the `pv` progress
        # viewer, wrapped around both ends of the sort -- confirm
        cmd = '%s -Wl -N "loaded lines" | sort | %s -Wl -N "sorted lines"' % (PV, PV)
        pipeSort = subprocess.Popen(cmd, env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True, shell=True)
    else:
        pipeSort = subprocess.Popen(['sort'], env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True)
    inputSort = pipeSort.stdin
    for line in srcFile:
        rec = RecordClass(line)
        inputSort.write( rec.getReversed().toStr() )
    # EOF on stdin lets sort emit its output; then wait for it to finish
    pipeSort.stdin.close()
    pipeSort.wait()
    saveFile.close()
    if pipeGzip is not None:
        # closing saveFile above gave gzip its EOF; wait until the archive is
        # completely written before returning, then release the real file
        pipeGzip.wait()
        rawFile.close()
コード例 #6
0
ファイル: filter.py プロジェクト: akivajp/acl2015
def filterTable(srcFile, saveFile, rules, progress, RecordClass = record.MosesRecord):
    '''Copy the records that satisfy matchRules(rec, rules) to saveFile.

    srcFile  -- path (str) or an iterable of lines
    saveFile -- path (str) or a writable file object; a path ending in '.gz'
                is compressed through `gzip`.  It is closed on return.
    rules    -- passed unchanged to matchRules
    progress -- when true and PV is set, output is routed through the PV
                command labelled "filtered lines"
    '''
    if isinstance(srcFile, str):
        srcFile = files.open(srcFile)
    pipeGzip = None
    rawFile = None
    if isinstance(saveFile, str):
        if files.getExt(saveFile) == '.gz':
            # records go into gzip's stdin, gzip writes into the real file
            rawFile = open(saveFile, 'w')
            pipeGzip = subprocess.Popen(['gzip'], stdin=subprocess.PIPE, stdout=rawFile)
            saveFile = pipeGzip.stdin
        else:
            saveFile = open(saveFile, 'w')
    pipePV = None
    if progress and PV:
        # `env` was referenced here without being defined in this function;
        # build it locally, the same way reverseTable does
        import os
        env = os.environ.copy()
        env['LC_ALL'] = 'C'
        cmd = '%s -Wl -N "filtered lines"' % (PV)
        pipePV = subprocess.Popen(cmd, env=env, stdin=subprocess.PIPE, stdout=saveFile, close_fds=True, shell=True)
        output = pipePV.stdin
    else:
        output = saveFile
    for line in srcFile:
        rec = RecordClass(line)
        if matchRules(rec, rules):
            output.write( rec.toStr() )
    if pipePV:
        pipePV.stdin.close()
        pipePV.communicate()
    saveFile.close()
    if pipeGzip is not None:
        # closing saveFile above gave gzip its EOF; wait until the archive is
        # completely written before returning, then release the real file
        pipeGzip.wait()
        rawFile.close()
コード例 #7
0
ファイル: table.py プロジェクト: akivajp/explib-python
 def __init__(self, tablePath, RecordClass, **options):
     '''Remember the table location, open it and load it immediately.

     tablePath   -- path opened for reading via files.open
     RecordClass -- stored as self.RecordClass
     options     -- 'showProgress' (default False) is forwarded to the loader
     '''
     self.tablePath = tablePath
     self.RecordClass = RecordClass
     # two initially empty lookup tables for the loader to work with
     self.recordsSrcTrg = {}
     self.recordsTrgSrc = {}
     self.tableFile = files.open(tablePath, 'r')
     self.__load(options.get('showProgress', False))
コード例 #8
0
ファイル: table.py プロジェクト: akivajp/acl2015
 def __init__(self, tablePath, RecordClass, **options):
     '''Store the constructor arguments, open the table and load it.

     tablePath   -- path opened for reading via files.open
     RecordClass -- stored as self.RecordClass
     options     -- 'showProgress' (default False) is forwarded to the loader
     '''
     progressWanted = options.get('showProgress', False)
     self.recordsSrcTrg = {}
     self.recordsTrgSrc = {}
     self.RecordClass = RecordClass
     self.tablePath = tablePath
     self.tableFile = files.open(self.tablePath, 'r')
     self.__load(progressWanted)
コード例 #9
0
ファイル: triangulate.py プロジェクト: akivajp/acl2015
def writeRecordQueue(workset):
    '''write the pivoted records in the queue into the table file

    Records are taken from workset.outQueue until a None sentinel arrives.
    Only records with rec.counts.co > 0 are written (via rec.toStr()) to the
    file at workset.pivotPath.
    '''
    pivotFile = files.open(workset.pivotPath, 'w')
    try:
        while True:
            rec = workset.outQueue.get()
            if rec is None:
                # None is the end-of-stream marker
                break
            if rec.counts.co > 0:
                pivotFile.write( rec.toStr() )
    finally:
        # close the table file even if a record fails to serialize
        pivotFile.close()
コード例 #10
0
ファイル: lex.py プロジェクト: akivajp/explib-python
def loadWordPairCounts(lexPath):
    '''Load word pair counts from a whitespace-separated lexicon file.

    Each non-blank line supplies the source word, the target word and the
    pair count in its first three fields.  The count is converted with
    number.toNumber and registered through PairCounter.addPair.  Any further
    fields on the line are not read here.

    Returns the filled PairCounter.
    '''
    pairCounter = PairCounter()
    lexFile = files.open(lexPath, 'r')
    try:
        for line in lexFile:
            fields = line.split()
            if not fields:
                # tolerate blank lines instead of failing on fields[0]
                continue
            # interned so that repeated words share a single string object
            srcWord = intern( fields[0] )
            trgWord = intern( fields[1] )
            pairCounter.addPair(srcWord, trgWord, number.toNumber(fields[2]))
    finally:
        lexFile.close()
    return pairCounter
コード例 #11
0
ファイル: lex.py プロジェクト: akivajp/explib-python
def saveWordPairCounts(savePath, pairCounter):
    '''Write the counts held by pairCounter to savePath, one pair per line.

    Pairs are visited in sorted order.  Each output line has the form
    "srcWord trgWord pairCount srcCount trgCount", every count being passed
    through number.toNumber(value, MARGIN) first.  Pairs whose converted
    count is not positive are skipped.
    '''
    saveFile = files.open(savePath, 'w')
    try:
        for pair in sorted(pairCounter.pairCounts):
            srcWord, trgWord = pair[0], pair[1]
            pairCount = number.toNumber(pairCounter.pairCounts[pair], MARGIN)
            if pairCount <= 0:
                # The write used to sit outside this guard, so a skipped pair
                # re-emitted the previous line (or raised NameError when it
                # was the very first pair).
                continue
            srcCount = number.toNumber(pairCounter.srcCounts[srcWord], MARGIN)
            trgCount = number.toNumber(pairCounter.trgCounts[trgWord], MARGIN)
            saveFile.write("%s %s %s %s %s\n" % (srcWord, trgWord, pairCount, srcCount, trgCount))
    finally:
        saveFile.close()
コード例 #12
0
ファイル: lex.py プロジェクト: akivajp/explib-python
def loadWordProbs(srcFile, reverse = False):
    '''Read word translation scores into a dict keyed by word pair.

    srcFile -- path (str) or an iterable of whitespace-separated lines whose
               first two fields are words, followed by numeric fields
    reverse -- false: map (field0, field1) to float(field2)
               true:  map (field1, field0) to float(field3)
    '''
    if type(srcFile) is str:
        srcFile = files.open(srcFile)
    # non-reversed lookups read column 2, reversed ones read column 3
    valueColumn = 3 if reverse else 2
    probs = {}
    for line in srcFile:
        fields = line.strip().split()
        first, second = fields[0], fields[1]
        pair = (second, first) if reverse else (first, second)
        probs[pair] = float(fields[valueColumn])
    return probs
コード例 #13
0
ファイル: triangulate.py プロジェクト: akivajp/acl2015
def calcPhraseTransProbsOnTable(tablePath, savePath, **options):
    '''calculate phrase trans probs on the table in which co-occurrence counts are estimated

    The table is read line by line; consecutive records sharing the same
    source side are collected into one group.  Whenever the source changes,
    the group is passed to calcPhraseTransProbsByCounts and then written to
    savePath with writeRecords.  Records with rec.counts.co <= 0 are dropped.

    options -- 'RecordClass' (default MosesRecord) parses each line;
               'method' (default METHOD) is looked up but not used here
    '''
    method = options.get('method', METHOD)
    RecordClass = options.get('RecordClass', MosesRecord)

    tableFile = files.open(tablePath, "r")
    saveFile  = files.open(savePath, "w")

    def flushGroup(group):
        # score one complete same-source group, then emit it
        calcPhraseTransProbsByCounts(group)
        writeRecords(saveFile, group)

    group = {}
    prevSrc = ''
    for line in tableFile:
        rec = RecordClass(line)
        if group and rec.src != prevSrc:
            # the source side changed: the collected group is complete
            flushGroup(group)
            group = {}
        if rec.counts.co > 0:
            group["%s ||| %s |||" % (rec.src, rec.trg)] = rec
        prevSrc = rec.src
    if group:
        # the last group has no following record to trigger the flush
        flushGroup(group)
    saveFile.close()
    tableFile.close()
コード例 #14
0
ファイル: reverse.py プロジェクト: akivajp/explib-python
def reverseTable(srcFile, saveFile, RecordClass=record.MosesRecord):
    '''Write the reversed form of every record of a table, sorted.

    Each input line is parsed with RecordClass, reversed with getReversed()
    and fed to an external `sort` process (run with LC_ALL=C) whose output
    goes to saveFile.

    srcFile  -- path (str) or an iterable of lines
    saveFile -- path (str) or a file object usable as a subprocess stdout;
                a path ending in '.gz' is compressed through `gzip`.
                It is closed on return.
    '''
    if isinstance(srcFile, str):
        srcFile = files.open(srcFile)
    pipeGzip = None
    rawFile = None
    if isinstance(saveFile, str):
        if files.getExt(saveFile) == '.gz':
            # sort writes into gzip's stdin, gzip writes into the real file
            rawFile = open(saveFile, 'w')
            pipeGzip = subprocess.Popen(['gzip'],
                                        stdin=subprocess.PIPE,
                                        stdout=rawFile)
            saveFile = pipeGzip.stdin
        else:
            saveFile = open(saveFile, 'w')
    gc.collect()
    env = os.environ.copy()
    env['LC_ALL'] = 'C'
    if PV:
        # PV is a module-level name; presumably the path of the `pv` progress
        # viewer, wrapped around both ends of the sort -- confirm
        cmd = '%s -Wl -N "loaded lines" | sort | %s -Wl -N "sorted lines"' % (
            PV, PV)
        pipeSort = subprocess.Popen(cmd,
                                    env=env,
                                    stdin=subprocess.PIPE,
                                    stdout=saveFile,
                                    close_fds=True,
                                    shell=True)
    else:
        pipeSort = subprocess.Popen(['sort'],
                                    env=env,
                                    stdin=subprocess.PIPE,
                                    stdout=saveFile,
                                    close_fds=True)
    inputSort = pipeSort.stdin
    for line in srcFile:
        rec = RecordClass(line)
        inputSort.write(rec.getReversed().toStr())
    # EOF on stdin lets sort emit its output; then wait for it to finish
    pipeSort.stdin.close()
    pipeSort.wait()
    saveFile.close()
    if pipeGzip is not None:
        # closing saveFile above gave gzip its EOF; wait until the archive is
        # completely written before returning, then release the real file
        pipeGzip.wait()
        rawFile.close()
コード例 #15
0
def filterTable(srcFile,
                saveFile,
                rules,
                progress,
                RecordClass=record.MosesRecord):
    '''Copy the records that satisfy matchRules(rec, rules) to saveFile.

    srcFile  -- path (str) or an iterable of lines
    saveFile -- path (str) or a writable file object; a path ending in '.gz'
                is compressed through `gzip`.  It is closed on return.
    rules    -- passed unchanged to matchRules
    progress -- when true and PV is set, output is routed through the PV
                command labelled "filtered lines"
    '''
    if isinstance(srcFile, str):
        srcFile = files.open(srcFile)
    pipeGzip = None
    rawFile = None
    if isinstance(saveFile, str):
        if files.getExt(saveFile) == '.gz':
            # records go into gzip's stdin, gzip writes into the real file
            rawFile = open(saveFile, 'w')
            pipeGzip = subprocess.Popen(['gzip'],
                                        stdin=subprocess.PIPE,
                                        stdout=rawFile)
            saveFile = pipeGzip.stdin
        else:
            saveFile = open(saveFile, 'w')
    pipePV = None
    if progress and PV:
        # `env` was referenced here without being defined in this function;
        # build it locally, the same way reverseTable does
        import os
        env = os.environ.copy()
        env['LC_ALL'] = 'C'
        cmd = '%s -Wl -N "filtered lines"' % (PV)
        pipePV = subprocess.Popen(cmd,
                                  env=env,
                                  stdin=subprocess.PIPE,
                                  stdout=saveFile,
                                  close_fds=True,
                                  shell=True)
        output = pipePV.stdin
    else:
        output = saveFile
    for line in srcFile:
        rec = RecordClass(line)
        if matchRules(rec, rules):
            output.write(rec.toStr())
    if pipePV:
        pipePV.stdin.close()
        pipePV.communicate()
    saveFile.close()
    if pipeGzip is not None:
        # closing saveFile above gave gzip its EOF; wait until the archive is
        # completely written before returning, then release the real file
        pipeGzip.wait()
        rawFile.close()
コード例 #16
0
ファイル: integrate.py プロジェクト: akivajp/explib-python
def integrateTablePair(tablePath1, tablePath2, savePath, **options):
    '''Combine two tables into one file by walking both readers in parallel.

    Record groups are fetched from each table with RecordReader.getRecords()
    and compared on the key "<src of first record> |||".  The group with the
    smaller key is written as is; groups with equal keys are combined by
    mergeRecords before writing.  Once one table is exhausted (empty group),
    the remaining groups of the other are copied through.

    NOTE(review): this only merges correctly if both tables deliver their
    groups in ascending key order -- presumably guaranteed upstream; confirm.
    '''
    # looked up as before; not used in this function
    RecordClass = options.get('RecordClass', MosesRecord)

    readerA = RecordReader(tablePath1, **options)
    readerB = RecordReader(tablePath2, **options)
    saveFile = files.open(savePath, 'w')

    groupA = readerA.getRecords()
    groupB = readerB.getRecords()
    while not (len(groupA) == 0 and len(groupB) == 0):
        # decide which side(s) to consume in this round
        if len(groupA) == 0:
            takeA, takeB = False, True
        elif len(groupB) == 0:
            takeA, takeB = True, False
        else:
            keyA = groupA[0].src + ' |||'
            keyB = groupB[0].src + ' |||'
            if keyA < keyB:
                takeA, takeB = True, False
            elif keyA > keyB:
                takeA, takeB = False, True
            else:
                takeA, takeB = True, True

        if takeA and takeB:
            outgoing = mergeRecords(groupA, groupB, **options)
        elif takeA:
            outgoing = groupA
        else:
            outgoing = groupB
        triangulate.writeRecords(saveFile, outgoing)

        # advance only the reader(s) whose group was consumed
        if takeA:
            groupA = readerA.getRecords()
        if takeB:
            groupB = readerB.getRecords()
    saveFile.close()
コード例 #17
0
ファイル: integrate.py プロジェクト: akivajp/acl2015
def integrateTablePair(tablePath1, tablePath2, savePath, **options):
    '''Combine two tables into one file by walking both readers in parallel.

    Record groups are fetched from each table with RecordReader.getRecords()
    and compared on the key "<src of first record> |||".  The group with the
    smaller key is written as is; groups with equal keys are combined by
    mergeRecords before writing.  Once one table is exhausted (empty group),
    the remaining groups of the other are copied through.

    NOTE(review): this only merges correctly if both tables deliver their
    groups in ascending key order -- presumably guaranteed upstream; confirm.
    '''
    # looked up as before; not used in this function
    RecordClass = options.get('RecordClass', MosesRecord)

    readerA = RecordReader(tablePath1, **options)
    readerB = RecordReader(tablePath2, **options)
    saveFile = files.open(savePath, 'w')

    groupA = readerA.getRecords()
    groupB = readerB.getRecords()
    while not (len(groupA) == 0 and len(groupB) == 0):
        # decide which side(s) to consume in this round
        if len(groupA) == 0:
            takeA, takeB = False, True
        elif len(groupB) == 0:
            takeA, takeB = True, False
        else:
            keyA = groupA[0].src + ' |||'
            keyB = groupB[0].src + ' |||'
            if keyA < keyB:
                takeA, takeB = True, False
            elif keyA > keyB:
                takeA, takeB = False, True
            else:
                takeA, takeB = True, True

        if takeA and takeB:
            outgoing = mergeRecords(groupA, groupB, **options)
        elif takeA:
            outgoing = groupA
        else:
            outgoing = groupB
        triangulate.writeRecords(saveFile, outgoing)

        # advance only the reader(s) whose group was consumed
        if takeA:
            groupA = readerA.getRecords()
        if takeB:
            groupB = readerB.getRecords()
    saveFile.close()
コード例 #18
0
ファイル: record.py プロジェクト: akivajp/explib-python
 def __init__(self, tablePath, **options):
     '''Open the table at tablePath for reading and start with no records.

     options -- 'RecordClass' selects the record type stored on the
                instance (default MosesRecord)
     '''
     self.RecordClass = options.get('RecordClass', MosesRecord)
     self.tableFile = files.open(tablePath, 'r')
     # starts empty; presumably filled by other methods of the class -- confirm
     self.records = []