def prepareCorpus(self, lines):
    # Build the word-indexed corpus for the selected line numbers.
    self.corpus = [[] for _ in lines]
    self.words, self.wordLanguages = [], []
    wIds = [{} for _ in range(self.numLang)]  # word -> id, one mapping per language
    numLangFlag = 0  # first language id contributed by the current file
    for file, fileOff in zip(self.files, self.offsets):
        for lineId, offsetId in enumerate(lines):
            file.seek(fileOff[offsetId])
            line = self.corpus[lineId]
            for i, sentence in enumerate(file.readline().split('\t')):
                languageId = i + numLangFlag
                wordIds = wIds[languageId]
                for word in sentence.split():
                    wordId = wordIds.get(word)
                    if wordId is None:
                        wordId = len(self.words)
                        wordIds[word] = wordId
                        self.words.append(word)
                        self.wordLanguages.append(languageId)
                    line.append(wordId)
        numLangFlag = languageId + 1  # next file starts after this file's languages
    # Document frequency of each word (number of selected lines containing it).
    self.wFr = [0] * len(self.words)
    for line in self.corpus:
        for wordId in set(line):
            self.wFr[wordId] += 1
    # The delimiter gets its own language id and the highest frequency of all.
    self.words.append(self.dlmn)
    self.wordLanguages.append(self.numLang)
    self.wFr.append(max(self.wFr) + 1)
    # Renumber words by decreasing frequency (the delimiter therefore gets id 0).
    srtfrqw = sorted(range(len(self.words)), key=self.wFr.__getitem__, reverse=True)
    self.words = [self.words[i] for i in srtfrqw]
    self.wordLanguages = getBestArray([self.wordLanguages[i] for i in srtfrqw], self.numLang)
    new_Pos = [None] * len(self.words)  # old word id -> new (frequency-ranked) id
    for i, wordId in enumerate(srtfrqw):
        new_Pos[wordId] = i
    for i, line in enumerate(self.corpus):
        # Replace word ids in corpus
        self.corpus[i] = [new_Pos[wordId] for wordId in line]
    self.wFr.sort(reverse=True)
    self.wFr = getBestArray(self.wFr)
    # Index the n-grams (lengths 2..self.indexer) of every language in every line.
    ngramgrw = range(2, self.indexer + 1)
    languagegrw = range(self.numLang)
    allNgramIds = [{} for _ in ngramgrw]  # ngram tuple -> id, one mapping per length
    self.allNgrams = [[] for _ in ngramgrw]
    self.ngramCorpora = [[] for _ in ngramgrw]
    for line in self.corpus:
        sentences = [[] for _ in languagegrw]
        ngramSentences = [set() for _ in ngramgrw]
        for word in line:
            sentences[self.wordLanguages[word]].append(word)
        for s in sentences:
            s = tuple(s)
            lastIdx = len(s) + 1
            for one in range(2, min(self.indexer + 1, lastIdx)):
                ngids = allNgramIds[one - 2]
                ngs = self.allNgrams[one - 2]
                ngsen = ngramSentences[one - 2]
                for i in range(lastIdx - one):
                    ng = s[i:i + one]
                    ngid = ngids.get(ng)
                    if ngid is None:
                        ngid = len(ngs)
                        ngids[ng] = ngid
                        ngs.append(ng)
                    ngsen.add(ngid)
        for one in ngramgrw:
            self.ngramCorpora[one - 2].append(sorted(ngramSentences[one - 2]))
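
# Illustrative note (not part of the original code): for a toy run over two
# bilingual files whose columns are "fr<TAB>en" and "de<TAB>es", prepareCorpus()
# assigns language ids 0-1 to the first file's columns and 2-3 to the second,
# so self.corpus[k] ends up holding the frequency-ranked word ids of line k
# across all four languages, with the delimiter self.dlmn ranked first (id 0).
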
def __init__(self, inputFilenames):
    # Default settings.
    self.maxNbLines = 0
    self.time = 100
    self.archivos = inputFilenames
    self.numNewAligns = -1
    self.discontiguousFields = ''
    self.minSize = 1
    self.maxSize = 7
    self.dlmn = None
    self.indexer = 1
    self.writer = HTMLOutput(sys.stdout, 'utf-8', None)
    self.counter = {}
    self.numAligns = 0
    self.AlignedFile = getTempFIle(".al_lw")
    self.offsets = []
    numLines = None
    self.numLang = 0
    try:
        self.files = [openFile(file) for file in inputFilenames]
        # Record the byte offset of every line of every file, so that
        # prepareCorpus() can later seek() directly to any subset of lines.
        for file in self.files:
            offset = 0
            fileOffsets = []
            fileLanguages = None
            lineNumber = -1
            for lineNumber, line in enumerate(file):
                fileLine = line.count('\t') + 1
                if fileLanguages is None:
                    fileLanguages = fileLine
                    self.numLang += fileLine
                else:
                    assert fileLine == fileLanguages, \
                        "There are %i columns instead of %i at line %i in file %s" % \
                        (fileLine, fileLanguages, lineNumber + 1, file.name)
                fileOffsets.append(offset)
                offset += len(line)
            if numLines is None:
                numLines = lineNumber + 1
            else:
                assert numLines == lineNumber + 1, \
                    "Input files have different numbers of lines"
            self.offsets.append(getBestArray(fileOffsets))
            del fileOffsets
        self.minLanguages = self.numLang
        narft = changeFields(self.discontiguousFields, self.numLang)
        self.contiguousFields = [(i + 1 not in narft) for i in range(self.numLang)]
        # Split the corpus into sub-corpora of at most maxNbLines lines each
        # and give each of them an equal share of the allotted time.
        if self.maxNbLines < 1:
            numCorpus = 1
        else:
            numCorpus = int(math.ceil(1. * numLines / self.maxNbLines))
        self.time /= 1. * numCorpus
        lines = list(range(numLines))
        random.shuffle(lines)
        for numCurpTres in range(numCorpus, 0, -1):
            # Draw an evenly sized random subset of the remaining lines.
            select = [lines.pop() for _ in range(int(math.ceil(1. * len(lines) / numCurpTres)))]
            select.sort()
            self.prepareCorpus(select)
            self.run(self.time, self.numNewAligns)
        setProbability(self.AlignedFile, self.counter, self.writer)
    finally:
        self.AlignedFile.close()
        for file in self.files:
            file.close()
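
# getBestArray() is not defined in this excerpt.  Judging from its call sites
# (a list of non-negative integers, plus an optional known maximum value), a
# plausible reading (purely a sketch, not the original implementation) is a
# helper that packs the list into the smallest suitable array.array:
#
#     import array
#     def getBestArray(values, maxi=None):
#         if maxi is None:
#             maxi = max(values) if values else 0
#         for typecode in 'BHIL':  # 1-, 2-, 4-, then at least 4-byte unsigned
#             if maxi < 2 ** (8 * array.array(typecode).itemsize):
#                 return array.array(typecode, values)
#         return array.array('Q', values)  # unsigned long long fallback
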