def run(self, timeout, num_newAlgn):
    """Align randomly sampled subcorpora, then hand the results to _weights.

    Subcorpora (lists of line indices into self.corpus) are drawn at random
    and passed to self.alignwords until one of the following happens:

    * ``timeout`` seconds have elapsed (skipped when ``timeout`` is None);
    * the measured rate of new alignments per second is no longer greater
      than ``num_newAlgn``;
    * the user interrupts with Ctrl-C (KeyboardInterrupt is swallowed on
      purpose so that the work done so far is still processed).

    Afterwards, when the corpus has more than two lines, every single-line
    subcorpus and the full-corpus subcorpus are aligned with weights derived
    from the number of size-2 samples drawn.  The temporary file is then
    rewound and passed to self._weights, and is always closed on exit.

    timeout     -- maximum sampling time in seconds, or None for no limit.
    num_newAlgn -- sampling continues only while the alignments-per-second
                   rate exceeds this value.
    """
    numLines = len(self.corpus)
    # NOTE(review): the arguments look like inclusive size bounds for the
    # generated subcorpus sizes -- confirm against balance().
    if numLines > 2:
        nextRandomSize = balance(self.principal, 2, numLines - 1).next
    else:
        nextRandomSize = balance(self.principal, 1, numLines).next

    pairSamples = 0        # number of sampled subcorpora of size exactly 2
    samplesDrawn = 0       # total number of sampled subcorpora
    alignsAtLastCheck = 0  # value of self.numAligns at the last rate check
    lastCheck = startTime = time()
    speed = sys.maxint     # guarantees at least one pass through the loop
    # Built once: random.sample() only reads its population, so there is no
    # need to allocate a fresh list of indices for every sample.
    lineIndices = range(numLines)

    print >> sys.stderr, "\rWorking, please wait..."
    tmpFile = getTempFIle(".al")
    try:
        try:
            while speed > num_newAlgn:
                now = time()
                if timeout is not None and now - startTime >= timeout:
                    break
                # Re-evaluate the alignment rate at most once per second.
                elapsedTime = now - lastCheck
                if samplesDrawn >= 1 and elapsedTime >= 1:
                    newAligns = self.numAligns - alignsAtLastCheck
                    speed = int(math.ceil(newAligns / elapsedTime))
                    alignsAtLastCheck = self.numAligns
                    lastCheck = now
                # Redraw until the size fits under the cap.
                # NOTE(review): maximumSize is not defined in this method;
                # presumably a module-level setting -- confirm it is not
                # meant to be self.maxSize.
                subcorpusSize = nextRandomSize()
                while subcorpusSize > maximumSize:
                    subcorpusSize = nextRandomSize()
                if subcorpusSize == 2:
                    pairSamples += 1
                samplesDrawn += 1
                self.alignwords(random.sample(lineIndices, subcorpusSize),
                                tmpFile)
        except KeyboardInterrupt:
            # Deliberate: an interrupt only stops sampling; what has been
            # aligned so far is still weighted below.
            pass

        if numLines > 2:
            logTwo = math.log(1 - 2. / (numLines + 1))
            wnum1 = 2 * pairSamples * logTwo / (
                numLines * math.log(1 - 1. / (numLines + 1)))
            wnum2 = 2 * pairSamples * logTwo / (
                numLines * math.log(1 - 1. * numLines / (numLines + 1)))
            # The weights are generally fractional.  The integer part is
            # always applied and the fractional part is turned into one
            # extra unit with matching probability.
            if wnum1:
                frac1, wnum1 = math.modf(wnum1)
                wnum1 = int(wnum1)
                for i in range(numLines):
                    wei = wnum1
                    if random.random() < frac1:
                        wei += 1
                    if wei:
                        self.alignwords([i], tmpFile, wei)
            if wnum2:
                fracN, wnum2 = math.modf(wnum2)
                wei = int(wnum2)
                if random.random() < fracN:
                    wei += 1
                if wei:
                    self.alignwords(range(numLines), tmpFile, wei)

        tmpFile.seek(0)
        self._weights(tmpFile)
    finally:
        tmpFile.close()
    print >> sys.stderr, "\rDone, please check your output file."
def __init__(self, inputFilenames):
    """Load the input files, run the aligner on them and write the result.

    The constructor does all the work: it sets the default parameters,
    opens and scans every input file, splits the line indices into one or
    more random subsets, calls self.prepareCorpus and self.run on each
    subset, and finally calls setProbability with the accumulated data.
    The temporary alignment file and every opened input file are closed
    before returning, whether or not an error occurred.

    inputFilenames -- iterable of names passed one by one to openFile().

    Raises AssertionError when a file has lines with differing numbers of
    tab-separated columns, or when the files differ in line count.
    """
    # Default parameters.
    self.maxNbLines = 0
    self.time = 100
    self.archivos = inputFilenames
    self.numNewAligns = -1
    self.discontiguousFields = ''
    self.minSize = 1
    self.maxSize = 7
    self.dlmn = None
    self.indexer = 1
    self.writer = HTMLOutput(sys.stdout, 'utf-8', None)
    self.counter = {}
    self.numAligns = 0
    self.AlignedFile = getTempFIle(".al_lw")
    self.offsets = []
    self.numLang = 0
    # Bound before the try block so that the finally clause can always
    # iterate it, even when opening one of the files fails.
    self.files = []
    numLines = None
    try:
        # Open everything first; appending one handle at a time means the
        # files opened before a failure are still closed in the finally.
        for filename in inputFilenames:
            self.files.append(openFile(filename))

        for handle in self.files:
            offset = 0
            fileOffsets = []       # starting offset of every line
            fileLanguages = None   # column count of this file's first line
            lineNumber = -1        # keeps the line count at 0 for an empty file
            for lineNumber, line in enumerate(handle):
                fileLine = line.count('\t') + 1
                if fileLanguages is None:
                    fileLanguages = fileLine
                    self.numLang += fileLine
                elif fileLine != fileLanguages:
                    # Raised explicitly (rather than via assert) so the
                    # check is not stripped when running with -O.
                    raise AssertionError(
                        ("There is %i columns "
                         " instead of %i at line %i in file %s")
                        % (fileLine, fileLanguages, lineNumber + 1,
                           handle.name))
                fileOffsets.append(offset)
                offset += len(line)
            if numLines is None:
                numLines = lineNumber + 1
            elif numLines != lineNumber + 1:
                raise AssertionError(
                    "Input files have different number of lines")
            self.offsets.append(getBestArray(fileOffsets))
            del fileOffsets

        self.minLanguages = self.numLang
        narft = changeFields(self.discontiguousFields, self.numLang)
        # contiguousFields[i] is True when the 1-based field number i + 1 is
        # not among the values returned by changeFields.
        self.contiguousFields = [(i + 1 not in narft)
                                 for i in range(self.numLang)]

        # Number of subsets to process; maxNbLines < 1 means a single one.
        if self.maxNbLines < 1:
            numCorpus = 1
        else:
            numCorpus = int(math.ceil(1. * numLines / self.maxNbLines))
        # The time budget is shared equally between the subsets.
        self.time /= 1. * numCorpus

        lines = list(range(numLines))
        random.shuffle(lines)
        for remaining in range(numCorpus, 0, -1):
            # Take an even share of what is left for this subset.
            share = int(math.ceil(1. * len(lines) / remaining))
            select = [lines.pop() for _ in range(share)]
            select.sort()
            self.prepareCorpus(select)
            self.run(self.time, self.numNewAligns)

        setProbability(self.AlignedFile, self.counter, self.writer)
    finally:
        self.AlignedFile.close()
        for handle in self.files:
            handle.close()