コード例 #1
0
    def prepareCorpus(self, lines):
        """Load the selected lines and build the word and n-gram indexes.

        For every open input file, the lines whose indices are listed in
        ``lines`` are read (via the offsets stored in ``self.offsets``),
        split on tabs into one sentence per language column, and split on
        whitespace into words.

        The method (re)sets the following attributes:

        * ``self.corpus``: one list of word ids per selected line, with the
          words of all language columns concatenated.
        * ``self.words`` / ``self.wordLanguages``: word string and language
          index for each word id, ordered by decreasing frequency.
        * ``self.wFr``: for each word id, the number of selected lines that
          contain the word, in decreasing order.
        * ``self.allNgrams``: for each n in ``2..self.indexer``, the list of
          distinct n-grams (tuples of word ids).
        * ``self.ngramCorpora``: for each n, one sorted list of n-gram ids
          per selected line.

        Args:
            lines: sequence of line indices (into the per-file offset
                tables) to load.
        """
        self.corpus = [[] for _ in lines]
        self.words, self.wordLanguages = [], []
        # One word -> id mapping per language, so identical strings in
        # different languages get different ids.
        wIds = [{} for _ in range(self.numLang)]
        numLangFlag = 0  # number of language columns seen in previous files
        for file, fileOff in zip(self.files, self.offsets):
            # Column count of this file. Tracked explicitly instead of
            # relying on a leaked loop variable, which was unbound (NameError)
            # when ``lines`` was empty.
            columns = 0
            for lineId, offsetId in enumerate(lines):
                file.seek(fileOff[offsetId])
                line = self.corpus[lineId]
                fields = file.readline().split('\t')
                columns = len(fields)
                for languageId, sentence in enumerate(fields, numLangFlag):
                    wordIds = wIds[languageId]
                    for word in sentence.split():
                        wordId = wordIds.get(word)
                        if wordId is None:
                            wordId = len(self.words)
                            wordIds[word] = wordId
                            self.words.append(word)
                            self.wordLanguages.append(languageId)
                        line.append(wordId)
            numLangFlag += columns

        # Frequency = number of lines containing the word (hence set(line)).
        self.wFr = [0] * len(self.words)
        for line in self.corpus:
            for wordId in set(line):
                self.wFr[wordId] += 1

        # Append the delimiter as an extra pseudo-word with its own language
        # index and a frequency higher than any real word, so that it ends up
        # first after the sort below. An empty vocabulary is handled too.
        self.words.append(self.dlmn)
        self.wordLanguages.append(self.numLang)
        self.wFr.append(max(self.wFr) + 1 if self.wFr else 1)

        # Renumber words by decreasing frequency.
        srtfrqw = sorted(range(len(self.words)), key=self.wFr.__getitem__, reverse=True)
        self.words = [self.words[i] for i in srtfrqw]
        # NOTE(review): getBestArray is defined elsewhere; presumably it packs
        # the values into a compact array -- confirm.
        self.wordLanguages = getBestArray([self.wordLanguages[i] for i in srtfrqw], self.numLang)
        new_Pos = [None] * len(self.words)
        for i, wordId in enumerate(srtfrqw):
            new_Pos[wordId] = i
        for i, line in enumerate(self.corpus):  # Replace word ids in corpus
            self.corpus[i] = [new_Pos[wordId] for wordId in line]

        self.wFr.sort(reverse=True)
        self.wFr = getBestArray(self.wFr)

        # n-gram indexes for n = 2 .. self.indexer; slot n - 2 holds size n.
        ngramgrw = range(2, self.indexer + 1)
        languagegrw = range(self.numLang)
        allNgramIds = [{} for _ in ngramgrw]
        self.allNgrams = [[] for _ in ngramgrw]
        self.ngramCorpora = [[] for _ in ngramgrw]

        for line in self.corpus:
            # Split the concatenated line back into one sentence per language.
            sentences = [[] for _ in languagegrw]
            ngramSentences = [set() for _ in ngramgrw]
            for word in line:
                sentences[self.wordLanguages[word]].append(word)
            for s in sentences:
                sentence = tuple(s)
                lastIdx = len(sentence) + 1
                # Only sizes that fit in the sentence are considered.
                for one in range(2, min(self.indexer + 1, lastIdx)):
                    ngids = allNgramIds[one - 2]
                    ngs = self.allNgrams[one - 2]
                    ngsen = ngramSentences[one - 2]
                    for i in range(lastIdx - one):
                        ng = sentence[i:i + one]
                        ngid = ngids.get(ng)
                        if ngid is None:
                            ngid = len(ngs)
                            ngids[ng] = ngid
                            ngs.append(ng)
                        ngsen.add(ngid)
            for one in ngramgrw:
                self.ngramCorpora[one - 2].append(sorted(ngramSentences[one - 2]))
コード例 #2
0
    def __init__(self, inputFilenames):
        """Index the input files, then process them in one or more sub-corpora.

        Each input file is scanned once to record the offset of every line
        and to check that all of its lines have the same number of
        tab-separated columns and that all files have the same number of
        lines. The line indices are then shuffled and split into
        ``numCorpus`` groups; for each group ``prepareCorpus`` and ``run``
        are called. Finally ``setProbability`` is called with the temporary
        alignment file, the counter and the writer.

        All opened files and the temporary alignment file are closed on
        exit, whether or not an error occurred.

        Args:
            inputFilenames: iterable of input file names, each passed to
                ``openFile``.

        Raises:
            AssertionError: if a file has lines with differing column counts
                or the files have differing numbers of lines.
        """
        # Settings (hard-coded here).
        self.maxNbLines = 0  # values < 1 mean a single sub-corpus
        self.time = 100  # passed to run(); divided among the sub-corpora
        self.archivos = inputFilenames
        self.numNewAligns = -1
        self.discontiguousFields = ''
        self.minSize = 1
        self.maxSize = 7
        self.dlmn = None
        self.indexer = 1
        self.writer = HTMLOutput(sys.stdout, 'utf-8', None)
        self.counter = {}
        self.numAligns = 0
        self.AlignedFile = getTempFIle(".al_lw")
        self.offsets = []
        # Bound before the try block so the cleanup in ``finally`` works even
        # if opening one of the files fails.
        self.files = []
        numLines = None
        self.numLang = 0

        try:
            # Open one file at a time so that files already opened are still
            # closed if a later one cannot be opened.
            for filename in inputFilenames:
                self.files.append(openFile(filename))
            for file in self.files:
                offset = 0
                fileOffsets = []
                fileLanguages = None
                lineNumber = -1  # so that an empty file counts as 0 lines
                for lineNumber, line in enumerate(file):
                    fileLine = line.count('\t') + 1
                    if fileLanguages is None:
                        fileLanguages = fileLine
                        self.numLang += fileLine
                    else:
                        assert fileLine == fileLanguages, "There is %i columns " \
                               " instead of %i at line %i in file %s" % \
                               (fileLine, fileLanguages, lineNumber + 1, file.name)
                    fileOffsets.append(offset)
                    # NOTE(review): offsets are accumulated with len(line) and
                    # later passed to seek(); this assumes len(line) matches
                    # the file's seek units (bytes / single-byte text) --
                    # confirm against openFile.
                    offset += len(line)
                if numLines is None:
                    numLines = lineNumber + 1
                else:
                    assert numLines == lineNumber + 1, \
                           "Input files have different number of lines"
                self.offsets.append(getBestArray(fileOffsets))
                del fileOffsets
            self.minLanguages = self.numLang
            narft = changeFields(self.discontiguousFields, self.numLang)
            # True for every 1-based field index that is not in ``narft``.
            self.contiguousFields = [(i + 1 not in narft) for i in range(self.numLang)]
            if self.maxNbLines < 1:
                numCorpus = 1
            else:
                numCorpus = int(math.ceil(1. * numLines / self.maxNbLines))
                self.time /= 1. * numCorpus
            # A real list is required: random.shuffle() and .pop() need a
            # mutable sequence, and range() is immutable in Python 3.
            lines = list(range(numLines))
            random.shuffle(lines)
            # Take an (almost) equal share of the remaining lines each round.
            for numCurpTres in range(numCorpus, 0, -1):
                select = [lines.pop() for _ in range(int(math.ceil(1. * len(lines) / numCurpTres)))]
                select.sort()
                self.prepareCorpus(select)
                self.run(self.time, self.numNewAligns)
            setProbability(self.AlignedFile, self.counter, self.writer)
        finally:
            self.AlignedFile.close()
            for file in self.files:
                file.close()