Example #1
def afterTurbo(data, allorthree):
    # Compare the gold sentences in `data` against the parser output in
    # tmp/predicted.conll; if `allorthree` is set, restrict the comparison
    # to sentences with at least three nouns (via has3Nouns).
    sents = [sent for sent in parse_incr(data)]
    predicted = open("tmp/predicted.conll", 'r')
    predsents = [sent for sent in parse_incr(predicted)]
    sentidx3nouns = [i for i in range(len(sents)) if has3Nouns(sents[i])]
    #avg = sum([len(sents[i]) for i in sentidx3nouns])/len(sentidx3nouns)
    #print(sum([len(sent) for sent in sents])/len(sents))
    #print(avg)
    #sentidx3nouns = [i for i in range(len(sents)) if len(sents[i])>=17]

    if allorthree:
        sents = [sents[i] for i in sentidx3nouns]
        predsents = [predsents[i] for i in sentidx3nouns]
        #print(sum([len(sent) for sent in sents])/len(sents))

    if len(sents) != len(predsents):
        print("number of gold sentences differs from number of predicted sentences")
    toteq = 0
    totneq = 0
    for (tlist1, tlist2) in zip(sents, predsents):
        eq, neq = compareTlist(tlist1, tlist2)
        toteq += eq
        totneq += neq
    with open("tmp/testresults.txt", 'w') as file:
        file.write(str(toteq) + "\n")
        file.write(str(totneq))
    print(toteq / (totneq + toteq))
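Example #1 relies on two project-specific helpers that are not shown. A minimal, hypothetical sketch of what they might look like (the real compareTlist may also compare dependency relations, and older conllu releases expose the tag under 'upostag' rather than 'upos'):

def has3Nouns(sent):
    # Hypothetical: at least three NOUN tokens in the sentence.
    return sum(1 for tok in sent if tok['upos'] == 'NOUN') >= 3

def compareTlist(gold, pred):
    # Hypothetical: count tokens whose predicted head matches the gold head.
    eq = sum(1 for g, p in zip(gold, pred) if g['head'] == p['head'])
    return eq, len(gold) - eq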
Example #2
def get_word_parsing(dev_path=None, test_path=None, train_path=None):
    # newer UD release, but the annotations in fac follow the older version
    if dev_path is None:
        dev_path = "../unified/uds/UD-EWT/en_ewt-ud-dev.conllu"
    if test_path is None:
        test_path = "../unified/uds/UD-EWT/en_ewt-ud-test.conllu"
    if train_path is None:
        train_path = "../unified/uds/UD-EWT/en_ewt-ud-train.conllu"
    print("-----------OpenUdConllu------------")
    dev = open(dev_path, "r", encoding="utf-8")
    test = open(test_path, "r", encoding="utf-8")
    train = open(train_path, "r", encoding="utf-8")
    dev_data = []
    test_data = []
    train_data = []
    print("-----------OpenUd---dev---------")
    for tokenlist in parse_incr(dev):
        dev_data.append(tokenlist)
    print("-----------OpenUd---test---------")
    for tokenlist in parse_incr(test):
        test_data.append(tokenlist)
    print("-----------OpenUd---train---------")
    for tokenlist in parse_incr(train):
        train_data.append(tokenlist)

    # Return the three splits keyed by file name (a plain dict is enough here:
    # the values are lists of TokenLists, not Counters).
    ud_data = {}
    ud_data['en-ud-dev.conllu'] = dev_data
    ud_data['en-ud-test.conllu'] = test_data
    ud_data['en-ud-train.conllu'] = train_data

    return ud_data
Example #3
def parse_wsd_data():

    # parse the EUD-EWT conllu files and retrieve the sentences
    # remove all punctuation?
    train_file = open("data/UD_English-EWT/en_ewt-ud-train.conllu",
                      "r",
                      encoding="utf-8")
    train_data = list(parse_incr(train_file))
    # train_data = [[''.join(c for c in word.get('lemma') if c not in string.punctuation) for word in token_list] for token_list in train_data]
    # train_data = [[word for word in s if word] for s in train_data]
    print(
        'Parsed {} training data from UD_English-EWT/en_ewt-ud-train.conllu.'.
        format(len(train_data)))

    test_file = open("data/UD_English-EWT/en_ewt-ud-test.conllu",
                     "r",
                     encoding="utf-8")
    test_data = list(parse_incr(test_file))
    # test_data = [[''.join(c for c in word.get('lemma') if c not in string.punctuation) for word in token_list] for token_list in test_data]
    # test_data = [[word for word in s if word] for s in test_data]
    print('Parsed {} testing data from UD_English-EWT/en_ewt-ud-test.conllu.'.
          format(len(test_data)))

    dev_file = open("data/UD_English-EWT/en_ewt-ud-dev.conllu",
                    "r",
                    encoding="utf-8")
    dev_data = list(parse_incr(dev_file))
    # dev_data = [[''.join(c for c in word.get('lemma') if c not in string.punctuation) for word in token_list] for token_list in dev_data]
    # dev_data = [[word for word in s if word] for s in dev_data]
    print(
        'Parsed {} dev data from UD_English-EWT/en_ewt-ud-dev.conllu.'.format(
            len(dev_data)))

    # parse the WSD dataset
    wsd_data = []

    # read in tsv by White et al., 2016
    with open('data/wsd/wsd_eng_ud1.2_10262016.tsv', mode='r') as wsd_file:

        tsv_reader = csv.DictReader(wsd_file, delimiter='\t')

        # store the data: ordered dict row
        for row in tsv_reader:

            # each data vector
            wsd_data.append(row)

        # make sure all data are parsed
        print('Parsed {} word sense data from White et al., 2016.'.format(
            len(wsd_data)))

    return wsd_data, train_data, test_data, dev_data
Example #4
def test_accuracy(test_file, probabilities):
    seen = 0
    correct = 0

    print("> Starting tests")
    with open(test_file, "r", encoding="utf-8") as tf:
        for sentence in parse_incr(tf):
            previous = None
            for word in sentence:
                if previous is not None:
                    previous_pos = previous["upos"]
                    pos = word["upos"]
                    possibilities = ensure(probabilities, previous_pos)
                    most_likely = possibilities[0][0]
                    if pos == most_likely:
                        correct += 1
                    seen += 1
                previous = word

    print("< Finishing tests")
    print()

    print(f"Total: {seen} words")
    print(f"Correct: {correct} PoS")
    print(f"Accuracy: {correct * 100 / seen} %")
Example #5
def non_projectivity_rate(train):
    sequences = []
    for tokenlist in parse_incr(train):
        sequences.append(tokenlist)
    non_proj = 0
    taille = 0

    for sentence in sequences:
        taille += len(sentence)
        # Collect (dependent, head) id pairs for every token that has a head
        # (multiword-token and empty-node lines have head None and are skipped).
        arcs = []
        for token in sentence:
            if token['head'] is not None:
                arcs.append((token['id'], token['head']))
        # An arc is non-projective if it crosses another arc: the two spans
        # overlap without one being nested inside the other.
        for k, (dep_k, head_k) in enumerate(arcs):
            lo, hi = min(dep_k, head_k), max(dep_k, head_k)
            for l, (dep_l, head_l) in enumerate(arcs):
                if l == k:
                    continue
                lo2, hi2 = min(dep_l, head_l), max(dep_l, head_l)
                if lo < lo2 < hi < hi2 or lo2 < lo < hi2 < hi:
                    non_proj += 1
                    break

    # rate of non-projective arcs per token
    return float(non_proj / taille)
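A possible call site for the function above, assuming a locally available UD treebank file (the path is only illustrative):

with open("en_ewt-ud-train.conllu", "r", encoding="utf-8") as train_file:
    print(non_projectivity_rate(train_file))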
Example #6
def main():
    docids = []
    file = open("yourfile.conllu", 'r', encoding="utf-8")
    for tokenlist in parse_incr(file):
        docids.append(tokenlist.metadata['sent_id'])

    trainfile = open("train.txt")
    sentences = open("sentences.txt", "w")
    num = 0
    for line in trainfile:
        line = line.split("\t")
        if line[0] in docids:
            pass
        else:
            num = num + 1
            #Remove annotations from data
            sent = line[2].replace(']', '')
            sent = sent.strip()
            sent = sent.split(" ")
            new_sent = []
            for item in sent:
                if item.startswith("["):
                    pass
                else:
                    new_sent.append(item)
            cleansent = " ".join(new_sent)
            sentences.write(cleansent)
            sentences.write('\n')
    print(num)
Example #7
def get_sentences(stressed=False):
    for fp in get_dataset_connlu_files():
        first_tokenlist = True
        for tokenlist in parse_incr(fp):
            first_token = True
            sentence = ''
            for token in tokenlist:
                # Newline before every token after the first, and before
                # paragraph-initial sentences ('newpar id' in the metadata)
                # once past the first token list.
                if not first_token or (not first_tokenlist
                                       and tokenlist.metadata.get('newpar id')):
                    sentence += '\n'

                # Use the stressed form from MISC when requested and present.
                if stressed and 'misc' in token and token['misc'] \
                        and 'StressedForm' in token['misc']:
                    form = token['misc']['StressedForm']
                else:
                    form = token['form']
                if form == '<g/>':
                    continue

                sentence += form

                try:
                    no_space_after = token['misc']['SpaceAfter'] == 'No'
                except (TypeError, KeyError):
                    no_space_after = False

                if not no_space_after:
                    sentence += ' '

                first_token = False

            yield sentence
            first_tokenlist = False
Example #8
def split_to_conllu_corpus(input_file, output_dir_path, num_of_sent=1000):
    """
    말뭉치를 분할
    :param input_file:
    :param output_dir_path:
    :param num_of_sent:
    :return:
    """
    def open_output_file(_output_dir_path, _output_file_name, _file_index):
        return open(_output_dir_path + '\\' + _output_file_name + '_' +
                    str(_file_index).zfill(3) + '.conllu',
                    'w',
                    encoding='utf-8-sig')

    file_name = os.path.splitext(input_file.name)
    file_name = os.path.split(file_name[0])
    output_file_name = file_name[1]
    file_index = 0
    _num_of_sent = 0
    output_file = open_output_file(output_dir_path, output_file_name,
                                   file_index)

    for tokenlist in parse_incr(input_file):

        if _num_of_sent >= num_of_sent:
            file_index += 1
            _num_of_sent = 0
            output_file = open_output_file(output_dir_path, output_file_name,
                                           file_index)

        _num_of_sent += 1
        print(tokenlist.serialize().strip(), file=output_file)
        print(file=output_file)
Example #9
def load():
    import conllu

    data_file = open("cs-ud-train-l.conllu", "r", encoding="utf-8")
    # parse_incr is recommended for large files (more than 1 MB)
    # since it returns a generator; we convert it to a list here so the
    # sentences can be iterated over several times
    tokenlist = list(conllu.parse_incr(data_file))

    for sentence in tokenlist:
        for token in sentence:
            form_lemma = token['form'] + token['lemma']

    for sentence in tokenlist:
        chain = []
        for token in sentence:
            if token['head']:
                parent = sentence[token['head'] - 1]
                if token['deprel'] == "case" and parent['deprel'] == "nmod":
                    chain.append(token)

    for sentence in tokenlist:
        for token in sentence:
            token['deprel'] = 'dep'

    with open('out.conllu', 'w', encoding='utf8') as f:
        f.writelines([sentence.serialize() + "\n" for sentence in tokenlist])
Example #10
    def tokenize(self, file_name):
        all_ud_tokens = []
        all_ud_data = []

        count_del, count_total = 0, 0

        # Initialise all the trees and embeddings
        with open(file_name, "r", encoding="utf-8") as file:
            for token_list in parse_incr(file):

                ud_tokens = []
                ud_data = []

                for item in token_list:
                    ud_tokens.append(item['form'])
                    ud_data.append({
                        'word': item['form'],
                        'pos': item['upostag'],
                        'head': item['head'],
                        'rel': item['deprel'],
                    })

                # If there are more than max_tokens tokens skip the sentence
                if len(ud_tokens) <= self.max_tokens:
                    all_ud_tokens.append(ud_tokens)
                    all_ud_data.append(ud_data)
                else:
                    count_del += 1
                count_total += 1

        if count_del > 0:
            print('\n\n\tWarning! Removed %d (of %d) long sentences\n\n' %
                  (count_del, count_total))
        return all_ud_tokens, all_ud_data
Example #11
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as conllu_file:
            logger.info("Reading UD instances from conllu dataset at: %s",
                        file_path)

            for annotation in parse_incr(conllu_file):
                # CoNLLU annotations sometimes add back in words that have been elided
                # in the original sentence; we remove these, as we're just predicting
                # dependencies for the original sentence.
                # We filter by integers here as elided words have a non-integer word id,
                # as parsed by the conllu python library.
                annotation = [
                    x for x in annotation if isinstance(x["id"], int)
                ]

                heads = [x["head"] for x in annotation]
                tags = [x["deprel"] for x in annotation]
                words = [x["form"] for x in annotation]
                if self.use_language_specific_pos:
                    pos_tags = [x["xpostag"] for x in annotation]
                else:
                    pos_tags = [x["upostag"] for x in annotation]
                yield self.text_to_instance(words, pos_tags,
                                            list(zip(tags, heads)))
Example #12
def download_embeddings(tmp_download_path, embeddings_save_path,
                        dataset_file_paths):
    download_model('el', tmp_download_path, if_exists='ignore')
    ft = fasttext.load_model(f'{tmp_download_path}/cc.el.300.bin')

    if not dataset_file_paths:
        dataset_file_paths = [
            f'data/ud/{ds}.conllu' for ds in ('train', 'dev', 'test')
        ]

    vocab = set()
    for p in dataset_file_paths:
        with open(p) as fr:
            for e in parse_incr(fr):
                for t in e:
                    vocab.add(t['form'].lower())

    word_vectors = []
    i2w = list(vocab)
    for word in i2w:
        word_vectors.append(ft.get_word_vector(word))
    word_vectors = [[0] * len(word_vectors[0])] + word_vectors
    i2w = ['<PAD>'] + i2w
    w2i = {w: i for i, w in enumerate(i2w)}

    with open(embeddings_save_path, 'wb') as fw:
        pickle.dump((np.array(word_vectors), w2i, i2w), fw)
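Reading the pickled embeddings back mirrors the dump above; a small usage sketch (the path is only illustrative):

import pickle

with open('embeddings.pkl', 'rb') as fr:
    word_vectors, w2i, i2w = pickle.load(fr)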
Example #13
def write_file_for_main_POS():
    """
    To extract sentences where all POS of a certain type are grouped.
    :return:
        """
    tags_of_interest = [
        'ADJ', 'ADV', 'NOUN', 'PRON', 'VERB', 'AUX', 'DET', 'PROPN'
    ]

    out_file_path = os.path.basename(path_to_conllu_file)[:-7] + '_POS.csv'

    with open('data/' + out_file_path, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            '{}|{} {}'.format('#' if i == 0 else '', i, t)
            for i, t in enumerate(tags_of_interest)
        ])

        for sentence in parse_incr(
                open(path_to_conllu_file, "r", encoding="utf-8")):
            token_forms = []
            for token in sentence:
                if token["upostag"] in tags_of_interest:
                    index = tags_of_interest.index(token["upostag"])
                    token_forms.append('|{} {} |'.format(
                        index, token["form"].replace('|', '')))
                else:
                    token_forms.append(token["form"])
            writer.writerow([' '.join(token_forms)])
Example #14
def load_check_data(check_file):
    with open(check_file, 'r', encoding='utf-8') as cf:
        conllulist = []
        for tokenlist in cl.parse_incr(cf):
            conllulist.append(tokenlist) 
    sentlist = [[token['form'] for token in sent] for sent in conllulist]
    return conllulist, sentlist
Example #15
def parse_and_extract(conllu_path) -> List[Tuple[Inp, Out]]:
    """Parse a CoNLL-U file and return the list of input/output pairs."""
    data = []
    with open(conllu_path, "r", encoding="utf-8") as data_file:
        for token_list in conllu.parse_incr(data_file):  # type: ignore
            data.append(extract(token_list))
    return data
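parse_and_extract assumes Inp, Out and extract() defined elsewhere in its project. A hypothetical sketch, taking an input to be the list of word forms and an output the list of UPOS tags (older conllu releases use the key "upostag"):

from typing import List, Tuple

Inp = List[str]
Out = List[str]

def extract(token_list) -> Tuple[Inp, Out]:
    # Hypothetical: split a TokenList into (word forms, UPOS tags).
    words = [tok["form"] for tok in token_list]
    tags = [tok["upos"] for tok in token_list]
    return words, tags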
Example #16
def extract_sentences(path, filenames, split='train'):
    """Extract word-tag lists of sentences from downloaded conllu files.

    Args:
        path (str): The path to the stored data.
        filenames (str): The name of the files for the current treebank.
        split (optional, str): The split to be returned: `train`, `test`, or `dev`.

    Returns:
        (list): A list of length equal to the number of sentences, where each
        element is a dict containing two lists: one with the words and
        one with the corresponding tags.
    """
    path = os.path.join(path, '%s%s.conllu' % (filenames, split))
    data_file = open(path, "r", encoding="utf-8")

    sentences = []
    for tokenlist in parse_incr(data_file):
        words = []
        tags = []
        for t in list(tokenlist):
            words.append(t['form'])
            tags.append(t['upos'])
        sentences.append({'words': words, 'udtag': tags})

    data_file.close()

    return sentences
Example #17
def extact(test_file):
    all_sent = []
    data_test = open(test_file, "r", encoding="utf-8")
    for tokenlist in parse_incr(data_test):
        # grab the second metadata value (typically the raw sentence text)
        all_sent.append(tuple(dict(tokenlist.metadata).values())[1])
    print('Test sentences are extracted')
    return all_sent
Example #18
def get_umbc_dict():
    dict_filen = os.path.join(proj_dir, 'umbc_freq.pkl')
    if os.path.exists(dict_filen):
        return pickle.load(open(dict_filen, mode='rb'))
    umbc_dir = '/mnt/store/home/makrai/data/language/english/corp/umbc_WebBase/English/'
    freq = defaultdict(int)
    for filen in glob.glob(
            os.path.join(umbc_dir, 'en-common_crawl-*.conllu.xz')):
        logging.info(filen)
        for i, sentence in enumerate(
                parse_incr(
                    lzma.open(os.path.join(umbc_dir, filen),
                              mode='rt',
                              encoding="utf-8"))):
            if not i % 100000:
                logging.debug(i)
            root = sentence.to_tree()
            subj, obj = '', ''
            for child in root.children:
                if 'subj' in child.token['deprel']:
                    if subj:
                        #logging.warn('subj: {}'.format((subj, child.token['lemma'], sentence)))
                        continue
                    subj = child.token['lemma']
                elif child.token['deprel'] == 'obj':
                    if obj:
                        #logging.warn('obj: {}'.format((obj, child.token['lemma'], sentence)))
                        continue
                    obj = child.token['lemma']
            #if bool(obj) and bool(subj):
            freq[(subj, root.token['lemma'], obj)] += 1
        pickle.dump(freq, open(dict_filen, mode='wb'))
    return freq
Example #19
def import_data(filepath):
    '''
    Imports the data from the specific .conllu file supplied.

    Parameters:
    filepath (str): Filepath to conllu file

    Returns:
    sentences (list<str>): A list of sentences
    sentence_tags (list<str>): A list of tags

    '''
    data_file = open(filepath, mode="r", encoding="utf8")
    tokenlist = list(parse_incr(data_file))

    tagged_sentences = []
    for sentence in tokenlist:
        tmp = []
        for token in sentence:
            tmp.append((token["form"], token["upos"]))

        tagged_sentences.append(tmp)

    sentences, sentence_tags = [], []

    for tagged_sentence in tagged_sentences:
        sentence, tags = zip(*tagged_sentence)
        sentences.append(np.array(sentence))
        sentence_tags.append(np.array(tags))

    return sentences, sentence_tags
Example #20
def read_conllu_outputs(reference_path, hypothesis_path):
    translations = []
    with open(reference_path, 'r', encoding="utf-8") as ref, \
         open(hypothesis_path, 'r', encoding="utf-8") as hyp:
        for ref_tokenlist, hyp_tokenlist in zip(parse_incr(ref),
                                                parse_incr(hyp)):
            ref_sentence = Sentence()
            hyp_sentence = Sentence()
            for token in ref_tokenlist:
                ref_sentence.words.append(Word(token["form"], token["lemma"]))
            for token in hyp_tokenlist:
                hyp_sentence.words.append(Word(token["form"], token["lemma"]))
            translations.append(Translation(ref_sentence, hyp_sentence))
    return translations
Example #21
def raw_text(input_file: str, output_file: str = None, gpu: bool = False, time: bool = True, memory: bool = True,
             ner: bool = True, model_name: str = "hu_core_news_lg"):
    nlp = load_pipeline(gpu, ner, model_name)
    if output_file:
        nlp.add_pipe("conll_formatter")

    data_file = open(input_file, "r", encoding="utf-8")
    sentences = list(parse_incr(data_file))

    texts = " ".join([s.metadata["text"] for s in sentences])

    if time:
        with Timer() as t:
            res = nlp(texts)
        print(f'Time spent: {t.elapsed:.2f} seconds')
    else:
        res = nlp(texts)

    if output_file:
        with open(output_file, 'w', encoding='utf-8') as writer:
            # noinspection PyProtectedMember
            print(rename_root(res._.conll_str), sep="\n", file=writer)

    if memory:
        print(f'Maximum memory usage: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MiB')
Example #22
def write_file_for_open_vs_closed_POS():
    """
    To extract sentences where open/closed class POS tags are grouped
    :return:
    """
    out_file_path = os.path.basename(
        path_to_conllu_file)[:-7] + '_open-closed.csv'

    with open('data/' + out_file_path, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            '{}|{} {}'.format('#' if i == 0 else '', i, t)
            for i, t in enumerate(['open', 'closed'])
        ])

        for sentence in parse_incr(
                open(path_to_conllu_file, "r", encoding="utf-8")):
            token_forms = []
            for token in sentence:
                if token["upostag"] in CONLLU_tags.open_class_tags:
                    token_forms.append('|{} {} |'.format(
                        0, token["form"].replace('|', '')))
                elif token["upostag"] in CONLLU_tags.closed_class_tags:
                    token_forms.append('|{} {} |'.format(
                        1, token["form"].replace('|', '')))
                else:
                    token_forms.append(token["form"])
            writer.writerow([' '.join(token_forms)])
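This example and Example #23 below import a project-specific CONLLU_tags module. A hypothetical stand-in based on the UD v2 inventory (the project's own lists may differ):

# CONLLU_tags.py (hypothetical)
open_class_tags = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB']
closed_class_tags = ['ADP', 'AUX', 'CCONJ', 'DET', 'NUM', 'PART', 'PRON', 'SCONJ']
nominal_core_arguments = ['nsubj', 'obj', 'iobj']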
Example #23
def write_file_for_nominal_core_args():
    """
    Extract just the 'nominal core args', i.e., subject and objects.
    :return:
    """
    out_file_path = os.path.basename(
        path_to_conllu_file)[:-7] + '_nom_args.csv'

    with open('data/' + out_file_path, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([
            '{}|{} {}'.format('#' if i == 0 else '', i, t)
            for i, t in enumerate(CONLLU_tags.nominal_core_arguments)
        ])

        for sentence in parse_incr(
                open(path_to_conllu_file, "r", encoding="utf-8")):
            token_forms = []
            for token in sentence:
                if token["deprel"] in CONLLU_tags.nominal_core_arguments:
                    index = CONLLU_tags.nominal_core_arguments.index(
                        token["deprel"])
                    token_forms.append('|{} {} |'.format(
                        index, token["form"].replace('|', '')))
                else:
                    token_forms.append(token["form"])
            writer.writerow([' '.join(token_forms)])
Example #24
def conllu2arr(direc):
    # Walk direc, parse each .conllu file, and append its tokens (minus
    # punctuation) to the corpus list matching the file name.
    for root, dirs, files in os.walk(direc):
        for file in files:
            if ".conllu" not in file:
                continue
            doc = []
            #print(file)
            filepath = os.path.join(root, file)
            with open(filepath, "r", encoding="utf-8") as datafile:
                for tokenlist in conllu.parse_incr(datafile):
                    for token in tokenlist:
                        if token["form"] not in punct:
                            doc.append(token["form"])
            #print(doc)
            if "kongzi" in file:
                kongzi.append(doc)
            elif "mengzi" in file:
                mengzi.append(doc)
            elif "liuxiang" in file:
                liuxiang.append(doc)
            elif "dongzhongshu" in file:
                dongzhongshu.append(doc)
            elif "zhuangzi" in file and "outer" not in file:
                zhuangzi.append(doc)
            elif "outer" in file:
                zhuangzi_test.append(doc)
Example #25
def parse_corpus(filename):

    data_file = open(filename, encoding="utf-8")

    ud_parses = list(parse_incr(data_file))
    
    return ud_parses
Example #26
def pearson_baseline(path):

    sentences = []

    for s in parse_incr(open(path, "r", encoding="utf-8")):
        sentences.append(s)

    baseline_left_score = []
    baseline_right_score = []
    gold_score = []

    for i, s in enumerate(sentences):
        arcs = tree_utils.conllu_to_arcs(s.to_tree())

        nodes = list(set([a[j] for j in [0, 1] for a in arcs]))
        nodes.sort()

        baseline_left = [(nodes[i], nodes[i - 1])
                         for i in range(1, len(nodes))]
        baseline_right = [(nodes[i - 1], nodes[i])
                          for i in range(1, len(nodes))]

        baseline_left_matrix = -tree_utils.arcs_to_distance_matrix(
            baseline_left)
        baseline_right_matrix = -tree_utils.arcs_to_distance_matrix(
            baseline_right)
        baseline_left_matrix_bidir = -tree_utils.arcs_to_distance_matrix(
            baseline_left, bidirectional=True)
        baseline_right_matrix_bidir = -tree_utils.arcs_to_distance_matrix(
            baseline_right, bidirectional=True)
        gold_matrix = -tree_utils.arcs_to_distance_matrix(arcs)
        gold_matrix_bidir = -tree_utils.arcs_to_distance_matrix(
            arcs, bidirectional=True)

        pearson_left = tree_utils.pearson_scores(baseline_left_matrix, s)
        pearson_left_bidir = tree_utils.pearson_scores(
            baseline_left_matrix_bidir, s)
        pearson_right = tree_utils.pearson_scores(baseline_right_matrix, s)
        pearson_right_bidir = tree_utils.pearson_scores(
            baseline_right_matrix_bidir, s)
        pearson_gold = tree_utils.pearson_scores(gold_matrix, s)
        pearson_gold_bidir = tree_utils.pearson_scores(gold_matrix_bidir, s)

        baseline_left_score.append(pearson_left + pearson_left_bidir)
        baseline_right_score.append(pearson_right + pearson_right_bidir)
        gold_score.append(pearson_gold + pearson_gold_bidir)

    print(
        "PEARSON BASELINES: (plain; irreflexive; bidirectional dep; bidir-irrefl) (same for bidirectional both sides)"
    )
    for label, scores in zip(
        ["LEFT", "RIGHT", "GOLD"],
        [baseline_left_score, baseline_right_score, gold_score]):
        print("  " + label)
        means = [np.nanmean(score) for score in zip(*scores)]
        print('   ' + '\n   '.join([
            '{} ({})'.format(a.round(2), b.round(2))
            for a, b in zip(means[::2], means[1::2])
        ]))
Example #27
def load_conllu(filepath):
    sentences = []
    print("loading sentences from {}".format(filepath))
    with open(filepath) as f:
        for s in tqdm(conllu.parse_incr(f)):
            if len(s) > 1:
                sentences.append(s)
    return sentences
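A possible call, with an illustrative path:

sentences = load_conllu("data/en_ewt-ud-train.conllu")
print(len(sentences), "sentences kept (single-token sentences are dropped)")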
Example #28
    def _read(self, file_path: str) -> Iterator[Instance]:
        """ Creates an iterator over instances from a file path """
        with open(file_path, 'r', encoding='utf-8') as f:
            for token_list in conllu.parse_incr(f):
                sentence = [token['form'] for token in token_list]
                pos_tags = [token['upostag'] for token in token_list]
                yield self.text_to_instance([Token(word) for word in sentence],
                                            pos_tags)
Example #29
def write_cleaned_version(output_filename, input_parsed_file,
                          whitelist_sent_id):
    outfile = open(output_filename, "w+", encoding="utf-8")
    with open(input_parsed_file, "r", encoding="utf-8") as infile:
        for sentence in conllu.parse_incr(infile):
            if sentence.metadata["sent_id"] in whitelist_sent_id:
                outfile.writelines(sentence.serialize())
    outfile.close()
Example #30
def parse_corpus(filename: str) -> List[TokenList]:
    """
    Parses a file into a collection of TokenLists
    """
    data_file = open(filename, encoding="utf-8")
    ud_parses = list(parse_incr(data_file))

    return ud_parses