Example 1
# Imports assumed from the surrounding SiRe codebase; the module holding
# SiReError in particular is an assumption.
import parsetrees
import utterance_utils
from error_messages import SiReError


def load_stanford_pcfg_parse(utt, parse, comma_is_pause=False):
    if utt.words is None:
        raise SiReError(
            "No words in utterance! Please load an mlf or txt file first!")
    tree = parsetrees.stanfordPcfgTree()
    tree.make_tree(parse)
    if comma_is_pause:
        leafs = tree.get_leafs(include_punct=[","])
    else:
        leafs = tree.get_leafs()
    num_w = utt.num_words_no_pau(comma_is_pause)
    if len(leafs) != num_w:
        # First we check whether the mismatch is due to differences in how words
        # are treated in parsing versus annotation.
        # The prime example is a transcription using a contraction, e.g. "there's"
        # instead of "there is": the parser splits "there's" into two tokens,
        # whereas a lexicon such as combilex treats it as one word.
        # If so, we split the WORD in two, with the 's becoming a single-phoneme,
        # single-syllable word. In other cases the contraction straddles two words
        # and we add a "phony" word which affects contexts but adds no phonemes.
        utterance_utils.try_split_words(utt)
        #Update num_w
        num_w = utt.num_words_no_pau(comma_is_pause)
        if len(leafs) != num_w:
            for w in utt.words:
                print(w.id)
            raise SiReError(
                "Number of leaves ({0}) not equal to number of words ({1})! In utt ({2})!"
                .format(len(leafs), num_w, utt.id))
    # Match each word with its leaf in the parse.
    words = utt.get_words_no_pau(comma_is_pause)
    for i, word in enumerate(words):
        # Leaf labels have the form "POS-word".
        label = leafs[i].label.split("-")
        word.id = label[1]
        word.pos = label[0]
        # There should always be a parent phrase.
        word.parent_phrase = leafs[i].parent
        # But there might not be a grandparent phrase.
        if word.parent_phrase.parent is not None:
            word.grandparent_phrase = word.parent_phrase.parent
        else:
            word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
        # And there might not be a great-grandparent phrase either.
        if word.grandparent_phrase.parent in [None, "xx"] or word.grandparent_phrase.parent.label == "xx":
            word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
        else:
            word.greatgrandparent_phrase = word.grandparent_phrase.parent

    #Now add fake parse for sil, pau and #
    for word in utt.words:
        if word.id in utt.phoneme_features.get_sil_phonemes():
            word.parent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.grandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.greatgrandparent_phrase = parsetrees.get_fake_stanford_pcfg_parse()
            word.pos = "sil"
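
The comment block inside the function describes a fallback for contractions: the parser tokenizes "there's" as two tokens while a lexicon such as combilex stores it as one word, so the word is split before the leaf/word counts are compared again. The snippet below is a minimal illustrative sketch of that idea only; it is not the real utterance_utils.try_split_words, and the split_contractions helper and plain string tokens are hypothetical stand-ins for SiRe's word objects.

# Illustrative sketch only (hypothetical helper, not SiRe's try_split_words):
# split a contraction token so that the word count lines up with the parser's
# leaves; the "'s" becomes its own single-phoneme, single-syllable word.
def split_contractions(tokens):
    out = []
    for tok in tokens:
        if tok.endswith("'s") and len(tok) > 2:
            out.append(tok[:-2])  # e.g. "there"
            out.append("'s")      # kept as a separate one-phoneme word
        else:
            out.append(tok)
    return out

print(split_contractions(["there's", "a", "cat"]))  # ['there', "'s", 'a', 'cat']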
Example 2
# Uses the same imports as Example 1 (parsetrees, utterance_utils, SiReError).
def load_stanford_dependency_parse(utt, parse):
    if utt.words is None:
        raise SiReError(
            "No words in utterance! Please load an mlf or txt file first!")
    tree = parsetrees.stanfordDependencyTree()
    tree.make_tree(parse)
    # As each word sits at a node rather than at a leaf, we get the nodes.
    nodes = tree.get_nodes(utt_sorted=True)
    if len(nodes) != utt.num_words_no_pau():
        # First we check whether the mismatch is due to differences in how words
        # are treated in parsing versus annotation.
        # The prime example is a transcription using a contraction, e.g. "there's"
        # instead of "there is": the parser splits "there's" into two tokens,
        # whereas a lexicon such as combilex treats it as one word.
        # If so, we split the WORD in two, with the 's becoming a single-phoneme,
        # single-syllable word. In other cases the contraction straddles two words
        # and we add a "phony" word which affects contexts but adds no phonemes.
        utterance_utils.try_split_words(utt)
        if len(nodes) != utt.num_words_no_pau():
            for node in nodes:
                print(node.label)
            raise SiReError(
                "Number of nodes ({0}) not equal to number of words ({1})! In utt ({2})!"
                .format(len(nodes), utt.num_words_no_pau(), utt.id))
    # Match each word with its node in the parse.
    for i, word in enumerate(utt.get_words_no_pau()):
        # As we may have split words, the parse contains the word id.
        word.id = nodes[i].label
        # But as we may have punctuation, the word itself knows its position in the utt.
        nodes[i].utt_pos = word.pos_in_utt()
        # The word's own node should always exist.
        word.parent_dependency = nodes[i]
        # And there should always be a parent node.
        word.grandparent_dependency = word.parent_dependency.parent
        # But there might not be a grandparent.
        if word.grandparent_dependency.parent is not None:
            word.greatgrandparent_dependency = word.grandparent_dependency.parent
        else:
            word.greatgrandparent_dependency = parsetrees.stanfordDependencyTree()

    #Now add empty parse for sil, pau and #
    for word in utt.words:
        if word.id in utt.phoneme_features.get_sil_phonemes() + [","]:
            word.parent_dependency = parsetrees.stanfordDependencyTree()
            word.grandparent_dependency = parsetrees.stanfordDependencyTree()
            word.greatgrandparent_dependency = parsetrees.stanfordDependencyTree()
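
For reference, here is a minimal sketch of the three-level dependency context the loop above attaches to each word. The Node class and its labels are hypothetical stand-ins for parsetrees.stanfordDependencyTree nodes, used only to show how parent_dependency, grandparent_dependency and greatgrandparent_dependency relate; the real loader substitutes an empty stanfordDependencyTree() when the chain runs out.

# Illustrative sketch only: a toy node class standing in for SiRe's dependency
# tree nodes, showing the parent chain the loader walks for each word.
class Node(object):
    def __init__(self, label, parent=None):
        self.label = label
        self.parent = parent

root = Node("ROOT")               # artificial root of the dependency tree
sat = Node("sat", parent=root)    # head word
cat = Node("cat", parent=sat)     # dependent word

parent_dependency = cat                    # the word's own node
grandparent_dependency = cat.parent        # its head; always present per the code above
if grandparent_dependency.parent is not None:
    greatgrandparent_dependency = grandparent_dependency.parent
else:
    greatgrandparent_dependency = None     # the loader uses an empty tree here instead

print(parent_dependency.label + " <- " + grandparent_dependency.label +
      " <- " + greatgrandparent_dependency.label)  # cat <- sat <- ROOT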