Example No. 1
def read_conll(treebank_path, langs, code_to_lang, train_or_dev, tgt_size=None, test=False):
  
  """
   Reads conll formatted file

   langs: list of languages
   train: read training data
   returns: dict with data for each language
   as list of tuples of sentences and morph-tags
  """

  annot_sents = {}
  unique = []
  for lang in langs:

    train = train_or_dev if not test else "test"

    filepath = treebank_path + "UD_" + code_to_lang[lang] + '/' + lang + "-ud-" + train + ".conllu"

    with open(filepath) as f:
      data = f.readlines()[:-1]
      data = [line for line in data if line[0]!='#']
      split_data = " ".join(data).split("\n \n")
      ud = [parse(sent)[0] for sent in split_data]

      all_text = []
      all_tags = []
      if langs[-1]==lang and tgt_size:
        tgt_size = min(tgt_size, len(ud))
        ud = ud[:tgt_size]
      for sent in ud:
        sent_text = []
        sent_tags = []
        for word in sent:
          word_tags = {}
          if word['feats']:
            word_tags = dict(word['feats'])
          if word['upostag']:
            if word_tags:
              word_tags.update({'POS':word['upostag']})
            else:
              word_tags = {'POS':word['upostag']}
          
          if word_tags:
            word_tags = freeze_dict(word_tags)
            if word_tags not in unique:
              unique.append(word_tags)

          sent_text.append(word['form'])
          sent_tags.append(freeze_dict(word_tags))

        all_text.append(sent_text)
        all_tags.append(sent_tags)

      annot_sents[lang] = [(w, m) for w, m in zip(all_text, all_tags)]

  return annot_sents, unique
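
A minimal usage sketch, assuming this function lives in a module that already imports parse from conllu.parser and defines a freeze_dict helper along these lines; the treebank root, language codes, and code_to_lang mapping are placeholders:

def freeze_dict(d):
    # hashable, order-independent view of a tag dict (assumed helper)
    return frozenset(d.items())

annot_sents, unique_tags = read_conll(
    "ud-treebanks-v2.0/",            # placeholder treebank root (note trailing slash)
    langs=["da", "sv"],              # placeholder language codes
    code_to_lang={"da": "Danish", "sv": "Swedish"},
    train_or_dev="train",
)
print(len(unique_tags), "distinct tag bundles")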
Example No. 2
    def parse(self, data):
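        # presumably: raw text -> CoNLL-U string (UD_PARSER) -> token lists (conllu_parser) -> plain dicts (to_json)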
        sentences_raw = UD_PARSER.parse(data)
        sentences_parsed = conllu_parser.parse(sentences_raw)
        sentences = self.to_json(sentences_parsed)

        return OrderedDict([
            ("sentences", sentences),
        ])
Example No. 3
 def load_conllu(self, filename):
     '''
     Loads a file in CoNLL-U format and parses it.
     For parsing files with fewer than 10 columns, this parser was used:
     https://github.com/svetlana21/conllu
     '''
     with open(filename, 'r', encoding='utf-8') as f:
         data = f.read()
     result = parse(data)
     return result
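
A usage sketch, assuming an instance loader of the surrounding class and a placeholder file name:

sentences = loader.load_conllu("example.conllu")
for sentence in sentences:
    for token in sentence:
        print(token["form"], token["upostag"])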
Example No. 4
def parse_sent(inf, outf, return_tree=True):

    # read configs and command line options
    config = configparser.ConfigParser()
    config.read('config.ini')
    in_fname, out_fname = inf, outf
    check_infile(in_fname)

    fname_clean = os.path.basename(in_fname).rsplit('.', 1)[0]

    # temporary files and folder
    tmp_path = get_path_from_config(config, 'TMP_PATH', 'tmp')
    tmp_fsuffixes = [
        '_mystem_in.txt', '_mystem_out.txt', '_treetagger_in.txt',
        '_treetagger_out.txt', '_raw.conll'
    ]
    tmp_dir = PurePosixPath(tmp_path)
    tmp_fnames = [
        str(tmp_dir / (fname_clean + fsuffix))
        for fsuffix in tmp_fsuffixes
    ]

    # output file and folder
    out_path = get_path_from_config(config, 'OUT_PATH', 'out')
    out_dir = PurePosixPath(out_path)
    if out_fname is None:
        out_fname = str(out_dir / (fname_clean + '.conll'))
    else:
        out_fname = str(out_dir / out_fname)

    # create output and temp folder if needed
    for path in [tmp_path, out_path]:
        if not os.path.exists(path):
            os.makedirs(path)

    # rock'n'roll
    process(in_fname, out_fname, config['DEFAULT']['APP_ROOT'],
            config['mystem']['MYSTEM_PATH'], config['malt']['MALT_ROOT'],
            config['malt']['MALT_NAME'], config['malt']['MODEL_NAME'],
            config['dicts']['COMP_DICT_PATH'],
            config['treetagger']['TREETAGGER_BIN'],
            config['treetagger']['TREETAGGER_PAR'], *tmp_fnames)

    for fname in tmp_fnames:
        os.remove(fname)

    with open(out_fname, 'r', encoding='utf-8') as conll_file:
        conll_data = conll_file.read()
    os.remove(out_fname)
    if return_tree:
        return parse_tree(conll_data)
    return parse(conll_data)
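
parse_sent reads everything from config.ini. A sketch of a minimal file covering every key the function accesses, generated here with configparser so the section layout is explicit; all paths and names are placeholders, and it is an assumption that get_path_from_config reads TMP_PATH and OUT_PATH from the DEFAULT section:

import configparser

config = configparser.ConfigParser()
config['DEFAULT'] = {
    'APP_ROOT': '/opt/pipeline',   # placeholder paths throughout
    'TMP_PATH': 'tmp',
    'OUT_PATH': 'out',
}
config['mystem'] = {'MYSTEM_PATH': '/opt/mystem/mystem'}
config['malt'] = {
    'MALT_ROOT': '/opt/maltparser',
    'MALT_NAME': 'maltparser.jar',
    'MODEL_NAME': 'russian.mco',
}
config['dicts'] = {'COMP_DICT_PATH': 'dicts/comp.dict'}
config['treetagger'] = {
    'TREETAGGER_BIN': '/opt/treetagger/bin/tree-tagger',
    'TREETAGGER_PAR': '/opt/treetagger/lib/russian.par',
}
with open('config.ini', 'w') as f:
    config.write(f)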
Example No. 5
    def load_from_conll_u_file(self, file_path):
        tokens = []
        with open(file_path) as input_file:
            data = input_file.read()
            parsed_data = parse(data)
            total_counter = 0
            total_span_counter = 0
            for sent_counter, sentence in enumerate(parsed_data):
                for token_counter, token in enumerate(sentence):

                    new_token = Token(total_counter,
                                      token['form'],
                                      total_span_counter,
                                      total_span_counter + len(token['form']))

                    total_span_counter += len(token['form']) + 1

                    if token_counter == 0:
                        new_token.add_a_label('SentenceBegin', str(sent_counter))

                    if token['lemma']:
                        new_token.add_a_label('lemma', token.get('lemma'))

                    if token['upostag']:
                        new_token.add_a_label('upostag', token.get('upostag'))

                    if token['xpostag']:
                        new_token.add_a_label('xpostag', token.get('xpostag'))

                    feats = token['feats']
                    if feats:
                        for feat in feats:
                            new_token.add_a_label('feat-' + feat, feats[feat])

                    if token['head']:
                        new_token.add_a_label('head', str(token.get('head')))

                    if token['deprel']:
                        new_token.add_a_label('deprel', str(token.get('deprel')))

                    if token['deps']:
                        new_token.add_a_label('deps', token.get('deps'))

                    misc = token['misc']
                    if misc:
                        for feat in misc:
                            new_token.add_a_label('misc-' + feat, misc[feat])

                    tokens.append(new_token)
                    total_counter += 1

        return tokens
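
A usage sketch; reader stands in for an instance of the surrounding class, and the file name is a placeholder. Note that total_span_counter advances by len(form) + 1, so the recorded character spans assume the text is rebuilt by joining forms with single spaces:

tokens = reader.load_from_conll_u_file("example.conllu")
print(len(tokens), "tokens loaded")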
Example No. 6
def read_conll(langs, code_to_lang, train_or_dev, tgt_size=None, test=False):
    """
   Reads conll formatted file
  """

    treebank_path = "/projects/tir2/users/cmalaviy/ud_exp/ud-treebanks-v2.0/"
    test_treebank_path = "/projects/tir2/users/cmalaviy/ud_exp/ud-test-v2.0-conll2017/input/conll17-ud-test-2017-05-09/"
    # treebank_path = "/projects/tir2/users/cmalaviy/ud_exp/ud-treebanks-conll2017/"

    annot_sents = {}

    for lang in langs:
        sent_text = []
        lemmas = []

        train = train_or_dev if not test else "test"

        if not test:
            filepath = treebank_path + "UD_" + code_to_lang[
                lang] + '/' + lang + "-ud-" + train + ".conllu"
        else:
            filepath = test_treebank_path + lang + "-udpipe.conllu"

        with open(filepath) as f:
            data = f.readlines()[:-1]
            data = [line for line in data if line[0] != '#']
            split_data = " ".join(data).split("\n \n")
            ud = [parse(sent)[0] for sent in split_data]

            if langs[-1] == lang and tgt_size:
                tgt_size = min(tgt_size, len(ud))
                ud = ud[:tgt_size]
            for sent in ud:
                for word in sent:
                    lemmas.append(word['lemma'] + "\n")
                    sent_text.append(word['form'] + "\n")
                    #lemmas.append(" ".join([w for w in word['lemma']]).encode('utf8') + "\n")
                    #sent_text.append(" ".join([w for w in word['form']]).encode('utf8') + "\n")

        with open("lemma-words/" + lang + "_words.txt", 'w') as f:
            f.writelines(sent_text)
        with open("lemma-words/" + lang + "_lemmas.txt", 'w') as f:
            f.writelines(lemmas)

        annot_sents[lang] = [(w, m) for w, m in zip(sent_text, lemmas)]

    return annot_sents
Example No. 7
    def __init__(self, file_path):
        # assumes a wt_object_file_path attribute, mirroring count_wt_prob_dict() below
        if os.path.exists(self.tw_object_file_path) and os.path.exists(
                self.wt_object_file_path) and os.path.exists(
                    self.ttt_object_file_path):
            self.read_external_file()
        else:
            with open(file_path, 'r') as corpus_file:
                corpus_data = re.sub(r" +", r"\t", corpus_file.read())
            sentence_list = parse(corpus_data)

            i = 0
            for sentence in sentence_list:
                i = i + 1
                print('start' + str(i))

                tag1 = Tag.NOTEXIST
                tag2 = Tag.NOTEXIST
                for token in sentence:

                    word = token['form']
                    tag_detector = Tag.NOTEXIST
                    tag = tag_detector.get_tag(token['upostag'])

                    if i == 12541:
                        print(word)

                    self.update_word_dict(word)
                    self.update_tag_word_dict(str(tag) + '|' + word)

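                    # tag-trigram / tag-bigram count keys, newest tag first (an HMM-style model, presumably)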
                    ttt_tag_key = str(tag) + '|' + str(tag1) + '|' + str(tag2)
                    tt_tag_key = str(tag1) + '|' + str(tag2)
                    self.update_ttt_dict(ttt_tag_key)
                    self.update_tt_dict(tt_tag_key)
                    self.update_t_dict(str(tag))

                    #update tag
                    tag2 = tag1
                    tag1 = tag
                print('finish ' + str(i))

            self.count_tw_prob_dict()
            self.count_wt_prob_dict()
            self.count_ttt_prob_dict()
            self.write_external_file()
Example No. 8
def parse_sentence_conllu(conllu_raw_string):
    # Result layout: result[sent_idx][word_idx][0] is the form, [1] is the upostag
    conllu_list_raw = conllu_raw_string.replace("\t", "   ").split("\n\n")
    conllu_list_final = []

    for clr in conllu_list_raw:
        if len(clr) > 2:
            clr_part = clr.split("\n")
            conllu_final = ""
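            # skip the first two lines of each block (sentence-level "#" metadata, presumably)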
            for cline in range(2, len(clr_part)):
                conllu_final += clr_part[cline] + "\n"
            parsed = parse(conllu_final)

            conllu_list_final.append([])
            for idx in range(0, len(parsed[0])):
                par = []
                par.append(parsed[0][idx]['form'])
                par.append(parsed[0][idx]['upostag'])
                conllu_list_final[-1].append(par)
    return conllu_list_final
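
A usage sketch with one two-token sentence; whether the legacy conllu.parser.parse used in these examples accepts the space-separated columns this function produces is an assumption, so treat the expected output as illustrative:

raw = ("# sent_id = 1\n"
       "# text = Hello world\n"
       "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n"
       "2\tworld\tworld\tNOUN\t_\t_\t1\tvocative\t_\t_\n")
sentences = parse_sentence_conllu(raw)
print(sentences)  # expected: [[['Hello', 'INTJ'], ['world', 'NOUN']]]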
Example No. 9
#!/usr/bin/python3
# main.py
import sys
from conllu.parser import parse
from graph import Graph

if len(sys.argv) != 2:
    raise Exception('Invalid Input')
with open(sys.argv[1], 'r') as f:
    data = parse(f.read())
g = Graph(len(data[0])+1)
for word in data[0]:
    g.add_edge(word['head'], word['id'])
if g.is_cyclic():
    print("Cycle found with the following path: {0}".format(g.rec_path))
else:
    print("No cycles found.")
Example No. 10
# Assumed reconstruction: the snippet begins mid-function, so the imports
# and the function head below are inferred from the lookup logic.
import random

from conllu.parser import parse

def get_tag(form):
	if form in upostag_dict:
		candidate_tags = upostag_dict[form]
		tag = max(candidate_tags, key=candidate_tags.get)
		# print('Candidate: %s => %s' % (candidate_tags, tag))
	else:
		# assume OOV word is a noun
		tag = 'NOUN'
	return tag

# for consistency
# SEED = 7

# read training data from file
with open('id-ud-train.conllu', 'r') as f:
	raw_train_data = f.read()

full_train_data = parse(raw_train_data)
num_full_train_data = len(full_train_data)
print("Num full train data: %s" %num_full_train_data)

# create validation data by splitting training data
# the split: 90% training data : 10% validation data
# random.seed(SEED)
random.shuffle(full_train_data)

validation_data = full_train_data[0:559]
train_data = full_train_data[559:]

num_validation_data = len(validation_data)
print("Num validation data: %s" %num_validation_data)

num_train_data = len(train_data)
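
A sketch of how this most-frequent-tag baseline might be evaluated on the held-out split, using the get_tag name from the reconstructed head above; upostag_dict is assumed to be populated from train_data (form -> tag -> count), which the truncated snippet does not show:

correct = total = 0
for sentence in validation_data:
    for token in sentence:
        correct += get_tag(token['form']) == token['upostag']
        total += 1
print("Baseline accuracy: %.3f" % (correct / total))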
Example No. 11
 def test_parse_data4(self):
     self.assertEqual(parse(data4), data4_flat)
Example No. 12
 def test_parse_data3(self):
     self.assertEqual(parse(data3), data3_flat)
Example No. 13
 def test_parse_data2(self):
     self.assertEqual(parse(data2), data2_flat)
Example No. 14
 def test_parse_only_id_data1(self):
     ids = [parsed_line["id"] for parsed_line in parse(data1, fields=["id"])[0]]
     num_lines = len(data1.strip().split("\n"))
     self.assertEqual(ids, list(range(1, num_lines + 1)))
Example No. 15
 def test_parse_data1(self):
     self.assertEqual(parse(data1), data1_flat)
Example No. 16
 def read_external_file(self, file_path):
     with open(file_path, 'r') as corpus_file:
         corpus_data = re.sub(r" +", r"\t", corpus_file.read())
     sentence_list = parse(corpus_data)
     return sentence_list
Example No. 17
from conllu.parser import parse
import sys
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
#import os
#os.chdir('Desktop\Informatika\Semester_7\IF4072_NLP\IF4072\Pos Tagging')

#data = open('id-ud-train.conllu', mode='r', encoding='utf-8').read() # For Python 3
data = open('id-ud-train.conllu', mode='r').read()
data_parsed = parse(data)

feature_data = []
target_data = []

for i in range(0, len(data_parsed)):
    for j in range(0, len(data_parsed[i])):
        # current word
        form = data_parsed[i][j].get('form')

        # word before and its POS tag
        word_before = ""
        postag_before = ""
        if (data_parsed[i][j].get('id') == 1):
            word_before = "Null"
            postag_before = "Null"
        else:
            word_before = data_parsed[i][j - 1].get('form')
            postag_before = data_parsed[i][j - 1].get('upostag')
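
The snippet is cut off here. A plausible continuation under the same scheme (feature = current form plus previous word and tag, target = the current upostag); this is an assumption, not the original code:

        feature_data.append([form, word_before, postag_before])
        target_data.append(data_parsed[i][j].get('upostag'))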
Example No. 18
 def test_parse_data7(self):
     parse(data7)
Example No. 19
 def test_parse_data6(self):
     self.assertEqual(parse(data6), data6_flat)
Example No. 20
def conllu_to_tokens(conllu):
    """Extract tokens from ConLL-U."""

    for sentence in parse(conllu):
        for word in sentence:
            yield word
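
A usage sketch with a single-token document:

sample = "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n"
for token in conllu_to_tokens(sample):
    print(token["form"], token["upostag"])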
Example No. 21
 def test_parse(self):
     from tests.fixtures.data1_flat import data1_expected
     self.assertEqual(parse(data1), data1_expected)