Example #1
0
def random_tags(target, source, env):
    """Replace every tag in a data set with a randomly drawn integer tag.

    Builder-style action taking ``(target, source, env)``:

    * ``source[0]`` -- path (via ``rstr()``) of the input file; the last item
      returned by ``DataSet.from_stream`` is used.
    * ``source[1]`` -- value node whose ``read()`` gives the tagging style.
      ``"type-based"`` draws one tag per word index and reuses it for every
      occurrence; any other value draws a fresh tag for every token.
    * ``env["NUM_TAGS"]`` -- upper argument passed to ``randint``.
    * ``target[0]`` -- path the re-tagged data set is written to.

    Words and analyses are carried over unchanged; tags are stored as strings.
    Always returns ``None``.
    """
    tag_count = env["NUM_TAGS"]
    style = source[1].read()
    with meta_open(source[0].rstr()) as in_stream:
        corpus = DataSet.from_stream(in_stream)[-1]

    # NOTE(review): if `randint` is `random.randint`, both bounds are
    # inclusive, so this produces NUM_TAGS + 1 distinct tag values --
    # confirm that is intended.
    if style == "type-based":
        # One draw per word type, made up front so every token of that
        # type receives the same tag.
        tag_by_word = dict(
            (word_index, randint(0, tag_count))
            for word_index in corpus.indexToWord.keys()
        )

        def choose_tag(word_index):
            return tag_by_word[word_index]
    else:
        def choose_tag(word_index):
            # Independent draw for each token occurrence.
            return randint(0, tag_count)

    retagged = []
    for sentence in corpus.sentences:
        tokens = []
        for word_index, _old_tag, analysis_indices in sentence:
            word = corpus.indexToWord[word_index]
            tag = str(choose_tag(word_index))
            analyses = [corpus.indexToAnalysis[a] for a in analysis_indices]
            tokens.append((word, tag, analyses))
        retagged.append(tokens)

    result = DataSet.from_sentences(retagged)
    with meta_open(target[0].rstr(), "w") as out_stream:
        result.write(out_stream)
    return None
Example #2
0
def conllish_to_xml(target, source, env):
    """Convert a two-column, blank-line-separated token file into a DataSet.

    Builder-style action taking ``(target, source, env)``:

    * ``source[0]`` -- path (via ``rstr()``) of the input text.  Sentences
      are separated by an empty line (``"\\n\\n"``); each non-blank line holds
      exactly two whitespace-separated fields, a word and its tag.
    * ``target[0]`` -- path the resulting data set is written to with
      ``DataSet.write``.
    * ``env`` -- unused.

    Every token is stored as ``(word, tag, [])``, i.e. with no analyses.
    Sentences with no tokens (caused by consecutive or trailing blank lines)
    are skipped, and leading/trailing whitespace on a line is ignored.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.

    Always returns ``None``.
    """
    with meta_open(source[0].rstr()) as ifd:
        text = ifd.read()

    sentences = []
    for chunk in text.split("\n\n"):
        tokens = []
        for line in chunk.split("\n"):
            # str.split() with no argument drops leading/trailing whitespace,
            # so a stray trailing space no longer yields an empty third field.
            fields = line.split()
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError(
                    "expected 'word tag' on each line, got %r" % (line,)
                )
            word, tag = fields
            tokens.append((word, tag, []))
        # Extra blank lines (or a file ending in a blank line) would
        # otherwise contribute empty sentences to the data set.
        if tokens:
            sentences.append(tokens)

    data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        data.write(ofd)
    return None
Example #3
0
def rtm_to_data(target, source, env):
    """Build an untagged data set from a line-oriented transcript file.

    Builder-style action taking ``(target, source, env)``:

    * ``source[0]`` -- path (via ``rstr()``) of the input; each line is one
      sentence.  The first five whitespace-separated fields of a line are
      discarded and the remaining fields are the words.
    * ``target[0]`` -- path the data set is written to with ``DataSet.write``.
    * ``env`` -- unused.

    The marker tokens ``(())`` and ``IGNORE_TIME_SEGMENT_IN_SCORING`` are
    dropped, and lines left with no words are skipped.  Every word is stored
    as ``(word, None, [])``: no tag and no analyses.  Always returns ``None``.
    """
    ignored_tokens = ("(())", "IGNORE_TIME_SEGMENT_IN_SCORING")
    token_rows = []
    with meta_open(source[0].rstr()) as in_stream:
        for line in in_stream:
            row = []
            # Fields 0-4 are per-line metadata, not words.
            for field in line.split()[5:]:
                if field in ignored_tokens:
                    continue
                row.append((field, None, []))
            if row:
                token_rows.append(row)

    dataset = DataSet.from_sentences(token_rows)
    with meta_open(target[0].rstr(), "w") as out_stream:
        dataset.write(out_stream)
    return None
Example #4
0
def add_morphology(target, source, env):
    """Attach morphological analyses from a lookup file to a data set.

    Builder-style action taking ``(target, source, env)``:

    * ``source[0]`` -- path (via ``rstr()``) of the input data set; the last
      item returned by ``DataSet.from_stream`` is used.
    * ``source[1]`` -- path of the morphology file.  Each line is
      ``word<TAB>analyses``, where analyses are separated by ``", "`` and
      each analysis is a whitespace-separated list of morphs.
    * ``target[0]`` -- path the augmented data set is written to.
    * ``env`` -- unused.

    For every morph, only the text before the first ``":"`` is kept, and
    morphs starting with ``"~"`` are dropped.  A word that appears on more
    than one line keeps only the analyses from its last line.  Existing
    words and tags are preserved; the analyses of each token are replaced by
    whatever ``get_without_case`` returns for that word.  Always returns
    ``None``.
    """
    with meta_open(source[0].rstr()) as data_stream:
        corpus = DataSet.from_stream(data_stream)[-1]

    analyses_by_word = {}
    with meta_open(source[1].rstr()) as morph_stream:
        for line in morph_stream:
            word, raw_analyses = line.split("\t")
            segmentations = set()
            for raw_analysis in raw_analyses.split(", "):
                morphs = tuple(
                    morph.partition(":")[0]
                    for morph in raw_analysis.split()
                    if not morph.startswith("~")
                )
                segmentations.add(morphs)
            analyses_by_word[word] = segmentations

    rebuilt = []
    for sentence in corpus.sentences:
        tokens = []
        for word_index, tag_index, _old_analyses in sentence:
            word = corpus.indexToWord[word_index]
            tag = corpus.indexToTag.get(tag_index, None)
            tokens.append((word, tag, get_without_case(word, analyses_by_word)))
        rebuilt.append(tokens)

    result = DataSet.from_sentences(rebuilt)
    with meta_open(target[0].rstr(), "w") as out_stream:
        result.write(out_stream)
    return None
Example #5
0
def random_segmentations(target, source, env):
    """Replace every token's analyses with one random segmentation of its word.

    Builder-style action taking ``(target, source, env)``:

    * ``source[0]`` -- path (via ``rstr()``) of the input data set; the last
      item returned by ``DataSet.from_stream`` is used.
    * ``source[1]`` -- value node whose ``read()`` gives the style.
      ``"type-based"`` draws one segmentation per word index and reuses it
      for every occurrence; any other value draws a fresh segmentation for
      every token.
    * ``target[0]`` -- path the new data set is written to.
    * ``env`` -- unused.

    Words and tags are carried over unchanged.  Always returns ``None``.
    """
    def get_random_segmentation(w):
        """Split *w* into up to three contiguous, non-empty pieces.

        A stem length of at least 1 and a prefix length are drawn with
        ``randint``; everything after the stem is the suffix, so the pieces
        always concatenate back to *w*.  Empty pieces are omitted.
        """
        stem_length = randint(1, len(w))
        prefix_length = randint(0, len(w) - stem_length)
        # No separate suffix length is drawn: the suffix is fully determined
        # by the prefix and stem, and an extra draw would be discarded.
        stem_end = prefix_length + stem_length
        pieces = (w[:prefix_length], w[prefix_length:stem_end], w[stem_end:])
        return tuple(piece for piece in pieces if piece)

    style_name = source[1].read()
    with meta_open(source[0].rstr()) as ifd:
        data = DataSet.from_stream(ifd)[-1]

    if style_name == "type-based":
        # .items() instead of .iteritems() so this runs on Python 2 and 3.
        word_index_to_analysis = dict(
            (i, get_random_segmentation(w))
            for i, w in data.indexToWord.items()
        )

        def analysis_for(word_index):
            return word_index_to_analysis[word_index]
    else:
        def analysis_for(word_index):
            return get_random_segmentation(data.indexToWord[word_index])

    sentences = [
        [
            (data.indexToWord[w], data.indexToTag.get(t, None), [analysis_for(w)])
            for w, t, _analyses in s
        ]
        for s in data.sentences
    ]
    new_data = DataSet.from_sentences(sentences)
    with meta_open(target[0].rstr(), "w") as ofd:
        new_data.write(ofd)
    return None