Esempio n. 1
0
 def __init__(self):
   """Initialise the instance's two private attributes."""
   # Initialised to 0; no reader of this attribute is visible in this excerpt.
   self.__something = 0
   # Alignments is defined outside this excerpt; a fresh instance is stored
   # per object.
   self.__Alignments = Alignments()
Esempio n. 2
0
class Utils:
  def __init__(self):
    """Create the helper together with its private Alignments instance."""
    # Set to 0 here and not read by any method visible in this class.
    self.__something = 0
    # readAligns() calls createAlignDict() on this object to combine a pair
    # of sentence blocks. Alignments itself is defined elsewhere.
    self.__Alignments = Alignments()
  
  def readSentences(self, fileName):
    reader = open(fileName)
    sents = []
    while 1:
      sent = self.getNextSentence(reader)
      if len(sent) == 0:
        break
      sents.append(sent)
    return sents
  
  def readSentencesPlain(self, fileName):
    reader = open(fileName)
    sents = []
    while 1:
      sent = self.getNextSentencePlain(reader)
      if len(sent) == 0:
        break
      sents.append(sent)
    return sents
  
  def readAligns(self, l1Aligns, l2Aligns):
    l1Reader = open(l1Aligns)
    l2Reader = open(l2Aligns)
    aligns = []
    while 1:
      l1Align = self.getNextSentence(l1Reader)
      l2Align = self.getNextSentence(l2Reader)
      #print l1Alignprint l2Align
      if len(l1Align) == 0:
        break
      align = self.__Alignments.createAlignDict(l1Align, l2Align)
      aligns.append(align)
      ##break
    return aligns
  
  def getNextSentence(self, filePtr):
    sent = []
    flag = 0
    while 1:
      line = filePtr.readline()
      if not line:
        break
      if line.find("<Sentence") == 0:
        flag = 1
      if flag == 1:
        sent.append(line)
      if line.find("</Sentence") == 0:
        flag = 0
        break
    return sent
  
  def getNextSentencePlain(self, filePtr):
    sent = []
    while 1:
      line = filePtr.readline()
      if not line:
        break
      if line.strip() == "":
        return sent
      sent.append(line.strip().split('\t'))
    return sent
    
  def validSequences(self, length):
    sequences = []
    for i in range(length):
      tempSequence = []
      for j in range(i,length):
        tempSequence.append(j)
        newTemp = []
        for k in tempSequence:
            if k not in newTemp:
                newTemp.append(k)
        newTemp.sort()
        if len(newTemp)!=length:
          sequences.append(newTemp)
    return sequences
  
  def makeCSSentence(self, l1Sent, l1Sequence, l2Sent, align, L1, L2):
    #print l2Sent;#print sequence;#print align;#print l1Sent;#print sequence;
    #sys.stderr.write("l1Sent:"+str(l1Sent)+'\n')
    #sys.stderr.write("l2Sent:"+str(l2Sent)+'\n')
    csSent = []
    addedFlag = 0
    l2Sequence = self.l2Sequence(l1Sequence, align)
    #print l1Sequence, l2Sequence
    for i in range(len(l1Sent)):
        if i < l1Sequence[0] or i> l1Sequence[-1]:
            newWT = [p for p in l1Sent[i]]
            newWT.append(L1)
            csSent.append(newWT)
        elif addedFlag == 0:
            for j in l2Sequence:
                try:
                  #sys.stderr.write("j:"+str(j)+" ")
                  newWT = [p for p in l2Sent[j]]
                  newWT.append(L2)
                  csSent.append(newWT)
                except:
                  pass
            addedFlag = 1
            ##sys.stderr.write("\n")
    return csSent
  
  def wordTags(self, sentence):
    ## Note the delimiter. It's _# and not _ :P
    wordTags = []
    sentence = sentence.strip()
    spaceSplitSent = sentence.split()
    for i in range(len(spaceSplitSent)):
      if spaceSplitSent[i][-1] == ")":
        wordTags.append([spaceSplitSent[i].strip(")"), spaceSplitSent[i-1].strip("(")])
    return wordTags
  
  def l2Sequence(self, l1Sequence, align):
    l2Sequence = []
    for i in l1Sequence:
      if i in align.keys():  # What about else? Don't need to worry because the sequence in use is already checked for contiguity on L2 side!!
        for j in align[i]:
          if j not in l2Sequence:
            l2Sequence.append(j)
    l2Sequence.sort()
    return l2Sequence