Ejemplo n.º 1
0
def _convert_eval_tags_for_loading(tags, words, representation, limit):
    """Rewrite one utterance's gold tags according to `representation`.

    The checks are substring tests on `representation`, applied in this order:
    "0" blanks every tag, otherwise the tags go through
    convert_from_eval_tags_to_inc_disfluency_tags; "trp" then applies
    add_word_continuation_tags; "simple" then maps every tag through
    convert_to_simple_label.
    """
    if "0" in representation:  # turn taking only
        tags = [""] * len(tags)
    else:
        tags = convert_from_eval_tags_to_inc_disfluency_tags(
            tags, words, representation=representation, limit=limit)
    if 'trp' in representation:
        tags = add_word_continuation_tags(tags)
    if 'simple' in representation:
        # A list (not a lazy map) so that len() and indexing work afterwards.
        tags = [convert_to_simple_label(t, rep=representation) for t in tags]
    return tags


def _encode_utterance(utt_reference, words, pos_tags, tags,
                      word_rep, pos_rep, tag_rep):
    """Look up one utterance in the rep dictionaries; return (x, p, y) arrays.

    Unknown words and POS tags fall back to the "<unk>" entry of their
    dictionary (and are logged); an unknown tag raises Exception.
    """
    word_ids = []
    pos_ids = []
    tag_ids = []
    # The tag sequence drives the iteration, as in the original code.
    for i, tag_label in enumerate(tags):
        w = word_rep.get(words[i])
        pos = pos_rep.get(pos_tags[i])
        tag = tag_rep.get(tag_label)
        if w is None:
            logging.info("No word rep for :" + words[i])
            w = word_rep.get("<unk>")
        if pos is None:
            logging.info("No pos rep for :" + pos_tags[i])
            pos = pos_rep.get("<unk>")
        if tag is None:
            logging.info("No tag rep for:" + tag_label)
            # Dump the utterance so the offending tag can be located.
            print("%s %s %s" % (utt_reference, tags, word_ids))
            raise Exception("No tag rep for:" + tag_label)
        word_ids.append(w)
        pos_ids.append(pos)
        tag_ids.append(tag)
    return np.asarray(word_ids), np.asarray(pos_ids), np.asarray(tag_ids)


def load_data_from_file(f, word_rep, pos_rep, tag_rep, representation="1", limit=8, n_seq=None):
    """Load a tab-separated corpus file into five parallel lists.

    Each row of `f` must have five columns: utterance reference, timing,
    word, POS tag and disfluency tag. A non-empty reference starts a new
    utterance; rows with an empty reference continue the current one.

    Arguments:
    f -- open file object with a `name` attribute; it is closed before
    returning, including when an exception is raised.
    word_rep, pos_rep, tag_rep -- dictionaries mapping words / POS tags /
    tags to their representation (presumably integer indices -- confirm
    against the caller). word_rep and pos_rep are expected to hold "<unk>".
    representation -- string naming the tag scheme; see
    _convert_eval_tags_for_loading for the substrings that are checked.
    limit -- passed through to convert_from_eval_tags_to_inc_disfluency_tags.
    n_seq -- unused; kept for backward compatibility.

    Returns (IDs, timings, seq, pos_seq, targets):
    IDs -- utterance references,
    timings -- per utterance, a tuple of (start, stop) pairs,
    seq, pos_seq, targets -- per utterance, numpy arrays of the looked-up
    word, POS and tag values.

    Raises Exception if a (converted) tag has no entry in tag_rep.
    """
    print("loading data %s" % (f.name,))
    IDs = []
    seq = []
    pos_seq = []
    targets = []
    timings = []

    def store(utt_id, words, pos_tags, tags, spans):
        # Convert, encode and append one finished utterance.
        converted = _convert_eval_tags_for_loading(
            tags, words, representation, limit)
        x, p, y = _encode_utterance(utt_id, words, pos_tags, converted,
                                    word_rep, pos_rep, tag_rep)
        seq.append(x)
        pos_seq.append(p)
        targets.append(y)
        IDs.append(utt_id)
        timings.append(tuple(spans))

    count_seq = 0
    utt_reference = ""
    currentWords = []
    currentPOS = []
    currentTags = []
    currentTimings = []
    # Marks the current fake time for the dialogue (i.e. end of word).
    current_fake_time = 0
    current_dialogue = ""

    try:
        reader = csv.reader(f, delimiter='\t')
        # The timing column is read but ignored.
        # TODO: for now 'fake' timing increments by one per word.
        for ref, timing, word, postag, disftag in reader:
            if ref != "":
                if count_seq > 0:  # nothing to store before the first reference
                    store(utt_reference, currentWords, currentPOS,
                          currentTags, currentTimings)
                    currentWords = []
                    currentPOS = []
                    currentTags = []
                    currentTimings = []
                count_seq += 1
                utt_reference = ref
                dialogue = utt_reference.split(":")[0]
                if dialogue != current_dialogue:
                    current_dialogue = dialogue
                    # TODO fake for now: word times restart with each dialogue.
                    current_fake_time = 0
            currentWords.append(word)
            currentPOS.append(postag)
            currentTags.append(disftag)
            currentTimings.append((current_fake_time, current_fake_time + 1))
            current_fake_time += 1
        # Flush the last utterance, which has no following reference row.
        if currentWords:
            store(utt_reference, currentWords, currentPOS,
                  currentTags, currentTimings)
    finally:
        # Close the handle even when a missing tag aborts the load.
        f.close()

    assert len(seq) == len(targets) == len(pos_seq)
    print("loaded " + str(len(seq)) + " sequences")
    return (IDs, timings, seq, pos_seq, targets)
Ejemplo n.º 2
0
def get_tag_data_from_corpus_file(f, representation="1", limit=8):
    """Load a tab-separated corpus file into five parallel lists of strings.

    Each row must have five columns: utterance reference, timing, word,
    POS tag and disfluency tag. A non-empty reference starts a new
    utterance; rows with an empty reference continue the current one.

    Arguments:
    f -- path of the corpus file to open.
    representation -- if it contains "trp" the tags are passed through
    add_word_continuation_tags; if it contains "simple" each tag is passed
    through convert_to_simple_label. Otherwise tags stay in GOLD form.
    limit -- unused; kept for backward compatibility.

    Returns (IDs, timings, seq, pos_seq, targets):
    IDs -- utterance references,
    timings -- per utterance, a tuple of (start, stop) pairs,
    seq, pos_seq, targets -- per utterance, tuples of word / POS / tag
    strings.

    NB this does not convert anything into arrays; it only outputs the
    string sequences.
    """
    IDs = []
    seq = []
    pos_seq = []
    targets = []
    timings = []

    def store(utt_id, words, pos_tags, tags, spans):
        # Apply the optional tag rewrites and append one finished utterance.
        if 'trp' in representation:
            tags = add_word_continuation_tags(tags)
        if 'simple' in representation:
            tags = [convert_to_simple_label(t) for t in tags]
        seq.append(tuple(words))
        pos_seq.append(tuple(pos_tags))
        targets.append(tuple(tags))
        IDs.append(utt_id)
        # Always a tuple, so the last utterance matches all the others.
        timings.append(tuple(spans))

    count_seq = 0
    utt_reference = ""
    currentWords = []
    currentPOS = []
    currentTags = []
    currentTimings = []
    # Marks the current fake time for the dialogue (i.e. end of word).
    current_fake_time = 0
    current_dialogue = ""

    # The with-block closes the file even if a malformed row raises.
    with open(f) as corpus:
        print("loading data %s" % (corpus.name,))
        reader = csv.reader(corpus, delimiter='\t')
        # The timing column is read but ignored; fake timings are generated.
        for ref, timing, word, postag, disftag in reader:
            if ref != "":
                if count_seq > 0:  # nothing to store before the first reference
                    store(utt_reference, currentWords, currentPOS,
                          currentTags, currentTimings)
                    currentWords = []
                    currentPOS = []
                    currentTags = []
                    currentTimings = []
                count_seq += 1
                utt_reference = ref
                dialogue = utt_reference.split(":")[0]
                if dialogue != current_dialogue:
                    current_dialogue = dialogue
                    # TODO fake for now: word times restart with each dialogue.
                    current_fake_time = 0
            currentWords.append(word)
            currentPOS.append(postag)
            currentTags.append(disftag)
            currentTimings.append((current_fake_time, current_fake_time + 1))
            current_fake_time += 1
        # Flush the last utterance, which has no following reference row.
        if currentWords:
            store(utt_reference, currentWords, currentPOS,
                  currentTags, currentTimings)

    assert len(seq) == len(targets) == len(pos_seq)
    print("loaded " + str(len(seq)) + " sequences")
    return (IDs, timings, seq, pos_seq, targets)
Ejemplo n.º 3
0
def create_tag_representations(tag_rep_filepath,
                               tags,
                               representation="disf1",
                               tag_corpus_file=None,
                               limit=8):
    """Create the tag representation file (and optionally a tag corpus file).

    Keyword arguments:
    tag_rep_filepath -- path of the file to write the tag rep to, as
    "index,tag" lines over the sorted set of tags seen.
    Note the tags must come from a training file.
    tags -- list of lists of tags (training)
    representation -- string naming the type of tagging system. The
    substrings "disf", "simple", "dact", "laugh", "uttseg" and
    "interactive" each switch on the corresponding part of the tag.
    tag_corpus_file -- optional path; if given, the comma/newline
    separated sequence of generated tags is written there.
    limit -- unused; kept for backward compatibility.
    """
    uttseg_pattern = r'<[ct]*/>'
    diact_pattern = r'<diact type="[^\s]*"/>'
    laugh_pattern = r'<speechLaugh/>|<laughter/>'
    speaker_pattern = r'<speaker floor="[^\s]*"/>'

    tag_dict = defaultdict(int)  # tag and the number of times it occurs
    print("creating tag file: %s ..." % (representation,))
    corpus_out = None
    if tag_corpus_file:
        print("(and creating tag corpus file)")
        # Opened up front so that an unwritable path fails before any work.
        corpus_out = open(tag_corpus_file, "w")

    try:
        tag_corpus = ""
        for tag_sequence in tags:
            for a_tag in tag_sequence:
                tag = ""
                subtags = get_tags(a_tag)
                if "disf" in representation:
                    # Keep every subtag that is not an utterance-segmentation,
                    # dialogue-act, laughter or speaker-floor tag.
                    for t in subtags:
                        if not (re.search(uttseg_pattern, t)
                                or re.search(diact_pattern, t)
                                or re.search(laugh_pattern, t)
                                or re.search(speaker_pattern, t)):
                            if "<speaker" in t:
                                print("WARNING speaker getting through")
                            tag += t
                if "disf" in representation and "simple" in representation:
                    tag = convert_to_simple_label(tag, "disf1")
                if "dact" in representation:
                    m = re.search(diact_pattern, a_tag)
                    if m:
                        tag += m.group(0)
                if "laugh" in representation:
                    m = re.search(laugh_pattern, a_tag)
                    if m:
                        tag += m.group(0)
                    else:
                        tag += "<nolaughter/>"
                if "uttseg" in representation:
                    m = re.search(uttseg_pattern, a_tag)
                    if m:
                        tag += m.group(0)
                    else:
                        if "<laugh" in a_tag:
                            continue
                        print("No utt seg found %s" % (a_tag,))
                        continue
                if tag == "":
                    if "<laugh" not in a_tag:
                        print("warning no tag %s" % (a_tag,))
                    continue
                if "interactive" in representation:
                    if "speaker" in tag:
                        print("in tag already %s %s" % (a_tag, tag))
                    m = re.search(speaker_pattern, a_tag)
                    if m:
                        tag += m.group(0)
                if ("uttseg" not in representation) and "t/>" in a_tag:
                    # non segmented mode
                    if "<speaker" in a_tag:
                        # only add tag at end as a single tag if interactive
                        tag_corpus += tag + ","
                    else:
                        tag_corpus += tag + "\n"
                # do segmentation last as it might not be segmented
                tag_dict[tag] += 1
                if ("uttseg" not in representation) and "<speaker" in a_tag:
                    continue  # i.e. if interactive treat as a single tag
                if "uttseg" not in representation:
                    m = re.search(uttseg_pattern, a_tag)
                    if m and "t/>" in m.group(0):
                        tag_corpus = tag_corpus.strip(",") + "\n"
                        continue
                tag_corpus += tag + ","
            # new line separated dialogue/speakers
            tag_corpus = tag_corpus.strip(",") + "\n"
        if corpus_out:
            corpus_out.write(tag_corpus)
    finally:
        # Release the corpus file even if tag processing raised.
        if corpus_out:
            corpus_out.close()

    print(tag_dict)
    # Sort once and number the tags in sorted order.
    tagstring = "\n".join(
        str(index) + "," + str(tag_name)
        for index, tag_name in enumerate(sorted(tag_dict))
    )
    with open(tag_rep_filepath, "w") as tag_rep_file:
        tag_rep_file.write(tagstring)
    print("tag file complete.")