# NOTE(review): this appears to be a stray duplicate of SentenceSplitter.__init__,
# which is defined again inside the class below -- confirm and remove if unused.
# As rendered on one physical line, every token after the first '#' is comment
# text, so only self.sentences would actually be assigned.
def __init__(self): self.sentences = [] # [(word, tag, lang), ...] self.scoredSentences = [] # [(word, tag, lang, lmScore), ...] self.splittedSentences = [] # [([sentence], tag)] self.enTrigramModel = WBTrigramModel() self.hiTrigramModel = WBTrigramModel()
class SentenceSplitter:
    """Split code-mixed sentences into monolingual fragments.

    Each loaded sentence is a list of (word, tag, lang) tuples, where lang is
    'E' (English) or 'H' (Hindi).  Every word is scored against the language
    model matching its language tag; the longest positive-scoring run of the
    sentence's majority language becomes the core fragment and the flanks are
    attributed to the other language.

    Relies on module-level helpers defined elsewhere in this project:
    WBTrigramModel, readlinesFromCSV, longestPositive, log (math.log).
    """

    def __init__(self):
        self.sentences = []          # [[(word, tag, lang), ...], ...]
        self.scoredSentences = []    # [[(word, tag, lang, lmScore), ...], ...]
        # 4-tuples appended by splitSentences:
        self.splittedSentences = []  # [(words, origin, sentIndex, startOffset), ...]
        self.enTrigramModel = WBTrigramModel()
        self.hiTrigramModel = WBTrigramModel()

    def trainLMsDefault(self):
        """Train both language models from the hard-coded default corpora.

        Raw strings keep the paths byte-identical to the Python 2 originals
        while avoiding escape sequences ('\\U', '\\D', ...) that Python 2 only
        tolerated by accident ('\\U...' is a SyntaxError on Python 3).
        """
        enCorpus = r'C:\Users\t-phgad\Documents\Project\Data\forLM/en-ICE-India.txt'
        hiCorpus = r'C:\Users\t-phgad\Documents\Project\Data\forLM/hi-TB.txt'
        self.enTrigramModel.loadSentences(enCorpus)
        self.hiTrigramModel.loadSentences(hiCorpus)

    def trainLMsWithOpts(self, enCorpus, hiCorpus, n):
        """Train both language models from caller-supplied corpus paths.

        `n` is accepted for backward compatibility but is currently unused --
        presumably the intended n-gram order; TODO confirm against callers.
        """
        self.enTrigramModel.loadSentences(enCorpus)
        self.hiTrigramModel.loadSentences(hiCorpus)

    def loadSentencesSingleColCSV(self, sentencesCSV):
        """Load sentences from a CSV file; blank rows separate sentences.

        The header row (csvLines[0]) is skipped.  Each non-blank row becomes
        one token tuple.  Fix: a trailing sentence that is not followed by a
        blank row is no longer silently dropped.
        """
        csvLines = readlinesFromCSV(sentencesCSV)
        sent = []
        for line in csvLines[1:]:
            if line[0] == u'':
                self.sentences.append(sent)
                sent = []
            else:
                sent.append(tuple(line))
        # Flush the final sentence when the file lacks a terminating blank row.
        if sent:
            self.sentences.append(sent)

    def loadSentences(self, sentences):
        """Replace self.sentences with a shallow copy of `sentences`."""
        self.sentences = list(sentences)

    # Two versions of the function: the trigrams and the unigrams.
    def scoreSentence(self, sentIndex):
        """Score one loaded sentence word by word.

        Returns a new list of (word, tag, lang, lmScore) tuples.  Each word is
        scored by the LM matching its language tag, using the widest context
        available (trigram, then bigram, then unigram).  The probability is
        mapped to -1/log(p), so higher scores mean the LM finds the n-gram
        more likely; zero probabilities are floored at 1e-6 first.
        """
        sentence = self.sentences[sentIndex]
        newSentence = []
        for index, (word, tag, lang) in enumerate(sentence):
            lm = self.enTrigramModel if lang == 'E' else self.hiTrigramModel
            # Widest context wins; the original computed a unigram score and
            # then overwrote it, which this avoids.  (Language-homogeneity
            # checks on the context were already commented out upstream.)
            if index >= 2:
                lmScore = lm.scoreTrigram((sentence[index - 2][0], sentence[index - 1][0], word))
            elif index >= 1:
                lmScore = lm.scoreBigram((sentence[index - 1][0], word))
            else:
                lmScore = lm.scoreUnigram(word)
            if lmScore == 0:
                lmScore = 0.000001  # floor before log, matching the original fallback
            # NOTE(review): lmScore == 1.0 would divide by zero here; assumed
            # unreachable with a smoothed LM -- confirm with WBTrigramModel.
            lmScore = -1.0 / log(lmScore)
            newSentence.append((word, tag, lang, lmScore))
        return newSentence

    def scoreSentences(self):
        """Score every loaded sentence into self.scoredSentences."""
        self.scoredSentences = [self.scoreSentence(i) for i in range(len(self.sentences))]

    def sanityCheck(self):
        """Print the scored-sentence count and the first scored sentence."""
        print(len(self.scoredSentences))
        print(self.scoredSentences[0])

    def analyzeSentences(self):
        """Debug helper: dump each scored sentence, pausing for Enter between them."""
        for sentence in self.scoredSentences:
            # One token per line, fields space-separated.
            print('\n'.join(' '.join(str(field) for field in token) for token in sentence) + '\n')
            dummy = raw_input()  # Python 2 builtin; pause until the user hits Enter

    def splitSentences(self):
        """Split each scored sentence into up to three fragments.

        The sentence's majority language ('H' on ties) is its origin.  Word
        scores are signed positive for origin-language words and negative
        otherwise; longestPositive picks the [start, end) core run.  Appends
        (words, origin, sentIndex, startOffset) tuples to
        self.splittedSentences, where words are (word, tag) pairs and
        sentIndex is 1-based.
        """
        index = 0
        for sentence in self.scoredSentences:
            index += 1  # 1-based sentence index, as in the original
            origins = Counter(token[2] for token in sentence)
            origin = 'E' if origins['H'] < origins['E'] else 'H'
            otherOrigin = 'H' if origin == 'E' else 'E'
            # Signed scores: positive iff the word belongs to the majority language.
            originedScores = [score if lang == origin else -score
                              for _, _, lang, score in sentence]
            start, end = longestPositive(originedScores)
            # List comprehension (not map) so slicing below works on Python 3 too.
            words = [token[0:2] for token in sentence]
            if start > 0:
                self.splittedSentences.append((words[0:start], otherOrigin, index, 0))
            if end > start:
                self.splittedSentences.append((words[start:end], origin, index, start))
            if len(words) > end:
                self.splittedSentences.append((words[end:len(words)], otherOrigin, index, end))