Example #1
0
    def __init__(self, loc=MOHX_LOCATION):
        """Load the MOH-X corpus from a comma-separated file.

        Each line after the header holds one sentence; column 3 is the
        space-separated sentence text, and the last two columns give the
        target word's index and its metaphor tag.

        Args:
            loc: path to the MOH-X CSV file (defaults to MOHX_LOCATION).
        """
        self.instances, self.words = [], []

        # Close the file deterministically (original leaked the handle).
        with open(loc) as f:
            lines = f.readlines()[1:]  # skip the header row

        for c, line in enumerate(lines):
            sentence = Corpus.Sentence()
            data = line.split(",")
            sentence.id = str(c)
            word_data = data[3].split()

            # NOTE(review): assumes data[-2] is the target-word index and
            # data[-1] the metaphor label -- confirm against the file format.
            # Hoisted out of the loop; original converted it per token.
            target_index = int(data[-2])

            for i, text in enumerate(word_data):
                met = "tag-" + data[-1].strip() if i == target_index else "N"
                w = Corpus.Word(text=text,
                                met=met,
                                sentence=sentence,
                                index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)

        Corpus.add_dependencies(self.instances, MOHX_DEPS, lex_field=1)
Example #2
0
    def __init__(self):
        """Load the TroFi corpus, tagging occurrences of each target verb.

        The file interleaves marker lines (``***verb`` section headers and
        literal/nonliteral cluster headers) with tab-separated sentence
        lines.  A sentence's label comes from the letter codes in its
        second field; any token whose verb lemma matches the current
        section's verb receives that label.
        """
        super().__init__()
        self.instances, self.words = [], []
        lemmatizer = WordNetLemmatizer()
        cur_verb, cluster = "", ""

        # Close the file deterministically (original leaked the handle).
        with open(TROFI_LOCATION) as f:
            lines = f.readlines()

        for line in lines:
            # "***verb" starts a new target-verb section.
            if re.match(r"\*\*\*[a-z]", line):
                cur_verb = line.split("***")[1]
                continue
            elif "*" in line or not line.strip():
                # Cluster header (or blank separator).  NOTE(review):
                # `cluster` is recorded but never read below -- confirm
                # whether it is meant to feed into the label.
                if "literal" in line:
                    cluster = "literal"
                elif "nonliteral" in line:
                    cluster = "nonliteral"
                continue

            sentence = Corpus.Sentence()
            data = line.strip().split("\t")
            sentence.id = data[0]

            # Label codes: N=metaphoric, L=literal, U=unknown.  Later
            # checks deliberately override earlier ones (original order kept).
            met = ""
            if "N" in data[1]:
                met = "met"
            if "L" in data[1]:
                met = "N"
            if "U" in data[1]:
                met = "?"

            # Hoisted: the original re-split data[2] on every iteration.
            tokens = data[2].split()
            for i, word in enumerate(tokens):
                v_lem = lemmatizer.lemmatize(word, "v")
                cur_met = "tag-" + met if v_lem == cur_verb else "N"
                w = Corpus.Word(text=word,
                                met=cur_met,
                                sentence=sentence,
                                index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)

        Corpus.add_dependencies(self.instances, TROFI_DEPS, lex_field=1)
Example #3
0
    def __init__(self, corpus_location):
        """Load a CSV corpus where each row marks one metaphoric word.

        Args:
            corpus_location: path to a CSV file whose rows hold a sentence
                id (column 1), the sentence text (column 3), and -- in the
                last two columns -- the target word's index and a tag.
        """
        self.instances, self.words = [], []

        # Materialize the rows inside a `with` so the file handle is closed
        # (original leaked it).
        with open(corpus_location) as f:
            reader = csv.reader(f)
            next(reader)  # skip the header row
            rows = list(reader)

        for line in rows:
            sentence = Corpus.Sentence()
            sentence.id = line[1]

            index = int(line[-2])
            # NOTE(review): `tag` is parsed but never used.  Kept so a
            # malformed last column still raises ValueError, as before.
            tag = int(line[-1])

            for i, word in enumerate(line[3].split()):
                met = "met" if i == index else "N"
                w = Corpus.Word(text=word, sentence=sentence, met=met, index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)
Example #4
0
def load_vuamc_csv(filename=VUAMC_CSV):
    """Load the VUAMC corpus from its CSV export.

    Each row holds a source-file id, a sentence id, and a space-separated
    token string whose tokens look like ``pos;;lemma;;text``; an ``M_``
    prefix on the text marks a metaphoric word, and underscore-joined
    multi-word expressions are split back into individual words here.

    Args:
        filename: path to the VUAMC CSV export (defaults to VUAMC_CSV).

    Returns:
        A ``(sentences, all_words)`` tuple: the parsed Corpus.Sentence
        list and a flat list of every Corpus.Word created.
    """
    with codecs.open(filename, encoding="latin-1", errors='replace') as f:
        data = list(csv.reader(f))

    sentences = []
    all_words = []

    # BUG FIX: the original iterated `range(1, len(data[1:]))`, i.e.
    # `range(1, len(data) - 1)`, which skips the header as intended but
    # also silently drops the final data row.  Iterate the rows directly.
    for line_data in data[1:]:
        if not line_data:
            continue
        sentence = Corpus.Sentence()

        sentence.source_file = line_data[0]

        # Map the source-file id onto its genre/domain, when known.
        if sentence.source_file in ACADEMIC:
            sentence.domain = "academic"
        elif sentence.source_file in CONVERSATION:
            sentence.domain = "conversation"
        elif sentence.source_file in FICTION:
            sentence.domain = "fiction"
        elif sentence.source_file in NEWS:
            sentence.domain = "news"
        sentence.id = line_data[1]

        j = 0  # running word index within the sentence
        # Hoisted: the original re-split the token string on every iteration.
        for token in line_data[2].split():
            w_data = token.split(";;")
            # An "M_" marker on the text field flags a metaphoric word;
            # the first two characters (the marker) are stripped off.
            if "M_" in w_data[-1]:
                met = "met"
                word_text = w_data[-1][2:]
            else:
                met = "N"
                word_text = w_data[-1]

            pos = w_data[0]
            lemma = w_data[1]

            for extra_words in word_text.split("_"):
                # Pieces containing no letters, digits, or punctuation
                # (e.g. empty residue of a leading/trailing underscore)
                # become "none" placeholders so the index sequence stays
                # aligned with the dependency parse.
                if not set(extra_words).intersection(
                        str(string.punctuation + string.ascii_letters +
                            string.digits)):
                    sentence.words.append(
                        Corpus.Word(text="none",
                                    met="none",
                                    pos="none",
                                    lemma="none",
                                    sentence=sentence,
                                    index=j))
                    j += 1
                    continue

                word = Corpus.Word(text=extra_words,
                                   met=met,
                                   pos=pos,
                                   lemma=lemma,
                                   sentence=sentence,
                                   index=j)

                sentence.words.append(word)
                all_words.append(word)
                j += 1
        sentences.append(sentence)

    Corpus.add_dependencies(sentences, VUAMC_DEPS)
    Corpus.add_vn_parse(sentences, VUAMC_VN)
    Corpus.add_allen_parse(sentences, VUAMC_ALLEN)
    #Corpus.populate_vn_from_heads(sentences)

    return sentences, all_words