Example #1
    def test_parse_CoNLL2009_2(self):
        data = dedent("""\
            #\tid='1'-document_id='36:1047'-span='1'
            1\t+\t+\tPunc\tPunc\t_\t0\tROOT\t_\t_
            2\tIn\tin\tr\tr\tr|-|-|-|-|-|-|-|-\t5\tAuxP\t_\t_
            3\tDei\tDeus\tn\tPropn\tn|-|s|-|-|-|m|g|-\t4\tATR\t_\t_
            4\tnomine\tnomen\tn\tn\tn|-|s|-|-|-|n|b|-\t2\tADV\t_\t_
            5\tregnante\tregno\tt\tt\tt|-|s|p|p|a|m|b|-\t0\tADV\t_\t_

        """)

        sentences = parse(
            data,
            fields=(
                'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'
            ),
            field_parsers={
                "feats": lambda line, i: [feat for feat in line[i].split("|")]
            }
        )
        self.assertEqual(
            sentences[0][4],
            OrderedDict([
                ('id', 5),
                ('form', 'regnante'),
                ('lemma', 'regno'),
                ('upostag', 't'),
                ('xpostag', 't'),
                ('feats', ['t', '-', 's', 'p', 'p', 'a', 'm', 'b', '-']),
                ('head', 0),
                ('deprel', 'ADV'),
                ('deps', None),
                ('misc', None),
            ])
        )
        self.assertEqual(
            sentences[0].metadata,
            OrderedDict([
                ('id', "'1'-document_id='36:1047'-span='1'")
            ])
        )
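A note on the field_parsers override above: it keeps the pipe-separated morphological tags as a plain list instead of letting the default parser turn feats into a dict. An equivalent, slightly shorter parser (a sketch, not part of the original test) would be:

field_parsers = {
    "feats": lambda line, i: line[i].split("|")
}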
Example #2
def __parse_post_lev(files: List[str]):
    result_docs = []
    for fname in files:
        with open(fname, 'r') as content_file:
            content = content_file.read()
        chunks = conllu.parse(content)
        assert len(chunks) == 1
        token_list, bio_list, tag_list = [], [], []
        for td in chunks[0]:
            bio_tags = td['xpostag'].split('-')
            tag = DataClass.ConllTag.NONE
            biostr = bio_tags[0]
            if biostr != 'O':
                if 'Fact' in bio_tags[1]:
                    tag = DataClass.ConllTag.Fact
                elif 'Testimony' in bio_tags[1]:
                    tag = DataClass.ConllTag.Testimony
                elif 'Value' in bio_tags[1]:
                    tag = DataClass.ConllTag.Value
                elif 'Rhetorical' in bio_tags[1]:
                    tag = DataClass.ConllTag.Rhetorical
                elif 'Policy' in bio_tags[1]:
                    tag = DataClass.ConllTag.Policy
                else:
                    assert False, 'Invalid proposition type: {}'.format(
                        bio_tags[1])
            assert biostr in ['B', 'I', 'O']
            token = td['form']
            token_list.append(token)
            bio_list.append(biostr)
            tag_list.append(tag)

        assert len(token_list) == len(bio_list) == len(tag_list)
        conll_doc = DataClass.ConllDocument(sentence_id=fname,
                                            token_list=token_list,
                                            bio_list=bio_list,
                                            tag_list=tag_list)
        result_docs.append(conll_doc)
        if len(result_docs) % 100 == 0:
            util.print_info('loaded chunks: {}'.format(len(result_docs)))
    return result_docs
Example #3
    def parse_tacred_json_to_ud(tacred_json_file_path: str, output_path: str, lang: str):
        nlp = SampleBARTAnnotator.get_ud_parser(lang)

        conllu_parse_list = []
        with open(tacred_json_file_path, 'r', encoding='utf-8') as input_file:
            tacred_json = json.load(input_file)
            for example_json in tqdm(tacred_json):
                doc = nlp([example_json["token"]])
                conllu_token_list = conllu.parse(doc._.conll_str)

                assert len(conllu_token_list) == 1
                conllu_token_list = conllu_token_list[0]

                relation = example_json['relation']
                tokens = [node["form"] for node in conllu_token_list]

                conllu_token_list.metadata["id"] = example_json['id']
                conllu_token_list.metadata["docid"] = example_json['docid']
                conllu_token_list.metadata["relation"] = relation
                conllu_token_list.metadata["token"] = json.dumps(tokens)
                # Add 1 because these indices start at 0, while CoNLL-U token ids start at 1
                conllu_token_list.metadata["subj_start"] = json.dumps(example_json['subj_start'] + 1)
                conllu_token_list.metadata["subj_end"] = json.dumps(example_json['subj_end'] + 1)
                conllu_token_list.metadata["obj_start"] = json.dumps(example_json['obj_start'] + 1)
                conllu_token_list.metadata["obj_end"] = json.dumps(example_json['obj_end'] + 1)
                conllu_token_list.metadata["subj_type"] = example_json['subj_type']
                conllu_token_list.metadata["obj_type"] = example_json['obj_type']

                trigger_tokens = search_triggers(example_json['subj_start'], example_json['subj_end'],
                                                 example_json['obj_start'], example_json['obj_end'],
                                                 relation, tokens, lang)

                # Sort for ease of reading
                trigger_tokens_sorted = sorted(trigger_tokens, key=lambda x: x[0])
                conllu_token_list.metadata["trigger_tokens"] = json.dumps(trigger_tokens_sorted)

                conllu_parse_list.append(conllu_token_list)

        with open(output_path, 'w', encoding='utf-8') as output_file:
            for conllu_token_list in conllu_parse_list:
                output_file.write(conllu_token_list.serialize())
Example #4
def generate_hypothesis_reference(input_dir=CONLLU_TEST_DIR):
    """
    This function return a reference and a hypothesis (list) by taking in a .conllu file.

    Args:
        input_dir: the position of the input .conllu file.

    Returns:
        `reference` contains the right for WER computing.
        `hypothesis` contains the raw Japanese that will be tokenized by the maxmatch and checked against the `reference`
    """
    with open(input_dir, "r") as f:
        source = conllu.parse(f.read())

    hypothesis = [(reduce(lambda x, y: x + y,
                          [token["form"] for token in tokenlist]))
                  for tokenlist in source]
    reference = [(reduce(lambda x, y: x + " " + y,
                         [token["form"] for token in tokenlist]))
                 for tokenlist in source]

    return reference, hypothesis
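The two reduce calls above simply concatenate token forms. A minimal equivalent sketch using str.join (same output for non-empty sentences, with no dependency on functools):

hypothesis = ["".join(token["form"] for token in tokenlist) for tokenlist in source]
reference = [" ".join(token["form"] for token in tokenlist) for tokenlist in source]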
Example #5
def get_terms(conllu_data):
    """Get candidate terms from parsed text.
    
    Args:
        conllu_data (str): Parses in CoNLL-U format

    Returns:
        Dict of { term: weight } where term is normalized term text and
        weight an estimate of term relevance to data.
    """
    sentences = conllu.parse(conllu_data)

    # Naive example implementation: take lemma counts for nouns and return
    # counts normalized to [0,1] by dividing by max count.
    count = Counter()
    for sentence in sentences:
        for token in sentence:
            if token['upostag'] in ('NOUN', 'PROPN'):
                count[token['lemma']] += 1
    max_count = max(count.values())
    return {k: v / max_count for k, v in count.items()}
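A minimal usage sketch for get_terms; the two-token CoNLL-U fragment below is invented for illustration and assumes the conllu package is installed:

sample = (
    "1\tdogs\tdog\tNOUN\t_\t_\t2\tnsubj\t_\t_\n"
    "2\tbark\tbark\tVERB\t_\t_\t0\troot\t_\t_\n"
    "\n"
)
print(get_terms(sample))  # expected: {'dog': 1.0}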
Example #6
 def testComplex2(self):
     content = [
         u'# sent_id = 2011Interviyu_Mariny_Astvatsaturyan.xml_11',
         u'# text = Тогда, как и сейчас, в качестве внештатного сотрудника.',
         u'0.1	_	_	_	_	_	_	_	0:exroot	_',
         u'1	Тогда	тогда	ADV	_	Degree=Pos	10	orphan	0.1:advmod	SpaceAfter=No',
         u'2	,	,	PUNCT	_	_	5	punct	5:punct	_',
         u'3	как	как	SCONJ	_	_	5	mark	5:mark	_',
         u'4	и	и	PART	_	_	5	advmod	5:advmod	_',
         u'5	сейчас	сейчас	ADV	_	Degree=Pos	1	advcl	1:advcl	SpaceAfter=No',
         u'6	,	,	PUNCT	_	_	5	punct	5:punct	_',
         u'7	в	в	ADP	_	_	10	case	10:case	_',
         u'8	качестве	качество	NOUN	_	Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing	7	fixed	7:fixed	_',
         u'9	внештатного	внештатный	ADJ	_	Case=Gen|Degree=Pos|Gender=Masc|Number=Sing	10	amod	10:amod	_',
         u'10	сотрудника	сотрудник	NOUN	_	Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing	0	root	0:root	SpaceAfter=No',
         u'11	.	.	PUNCT	_	_	10	punct	10:punct	_'
     ]
     sentences = parse('\n'.join(content))
     tokens = extract_text(sentences[0])
     expected = sentences[0].metadata['text']
     self.assertEqual(expected, ''.join(tokens))
Example #7
def load_ud_corpus(ud_source_dir: str):
    """
    Extracts the underlying UD corpus data stored in CoNLL-U format.
    Returns a dictionary mapping (split, sentence index) tuples to the sentence
    text (token forms joined by spaces).
    """
    data_path = os.path.join(ud_source_dir, "UD_English-EWT-r1.2")

    sent_id_to_text = {}
    for split in ["train", "dev", "test"]:
        split_path = os.path.join(data_path, f"en-ud-{split}.conllu")
        log.info("Loading UD data from %s", split_path)
        with open(split_path) as fd:
            data = "".join(line for line in fd)
        data = conllu.parse(data)
        sent_count = 0
        for sent in data:
            sent_id_to_text[(split, sent_count)] = " ".join(
                [item["form"] for item in sent])
            sent_count += 1

    return sent_id_to_text
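A quick usage sketch (the directory path is hypothetical), showing that the returned keys are (split, sentence index) tuples:

sent_id_to_text = load_ud_corpus("/path/to/ud_source")  # hypothetical path
print(sent_id_to_text[("train", 0)])  # text of the first training sentence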
Example #8
 def score(self, sentence):
     sentence_relations = []
     if not self.udpipe:
         doc = self.nlp(sentence)
         for token in doc:
             sentence_relations.append(str(token.dep_))
         BOR = self.count_vec.transform([' '.join(sentence_relations)
                                         ]).toarray().tolist()
         # BOR = BOR.tolist()
     else:
         sentences = self.udpipe_model.tokenize(sentence)
         for s in sentences:
             self.udpipe_model.tag(s)
             self.udpipe_model.parse(s)
         conllu_txt = self.udpipe_model.write(
             sentences, "conllu")  #conllu|horizontal|vertical
         conllu_obj = conllu.parse(conllu_txt)
         for item in conllu_obj[0]:
             sentence_relations.append(str(item['deprel']))
         BOR = self.count_vec.transform([' '.join(sentence_relations)
                                         ]).toarray().tolist()
     return BOR[0]
Example #9
def get_tree(files, out):
    for file in files:
        with open(args.fp + file, "r", encoding="utf-8") as infile, open(
                out, "a", encoding="utf-8") as outfile:
            data = infile.read()
            items = data.split("\n\n")
            threshold = min(len(items), 5000)
            for item in items[:threshold]:
                try:
                    parsed = parse(item)
                    item_postags = []
                    if parsed:
                        for token in parsed[0]:
                            item_postags.append(token["upostag"])
                        outfile.write(" ".join(item_postags) + ", " +
                                      file.split("-")[0] + "\n")
                except conllu.parser.ParseException:
                    print(file.split("-")[0])
                    print(item)
Example #10
    def testNormal(self):
        content = [
            u'# newdoc id = doc1',
            u'# newpar id = par1',
            u'# sent_id = 1',
            u'# text = Результаты \xa0(\xa0 нет',
            u'1	Результаты	_	_	_	_	_	_	_	SpacesAfter=\\s\\xa0',
            u'2	(	_	_	_	_	_	_	_	SpacesAfter=\\xa0\\s',
            u'3	нет	_	_	_	_	_	_	_	SpaceAfter=No',
            u'',
            u'',
        ]
        sentences = parse('\n'.join(content))
        sentences = split_sent(sentences[0])
        result = [extract_text(s, validate=False) for s in sentences]
        expected = [
            [u'Результаты', u' \xa0', u'(', u'\xa0 ', u'нет'],
        ]

        self.assertEqual(len(expected), len(result))
        for e, r in zip(expected, result):
            self.assertListEqual(e, r)
Example #11
    def return_Sidorov(self, text, key):
        sentences = self.tokenize(text)

        # Then, we perform tagging and parsing for each sentence
        for s in sentences:
            self.tag(s)  # inplace tagging
            self.parse(s)  # inplace parsing

        conllu_txt = self.write(sentences,
                                "conllu")  # conllu|horizontal|vertical

        # print(conllu_txt)
        output = "FLAGSidorov" + key + " "
        conllu_obj = conllu.parse(conllu_txt)
        for i in range(0, len(sentences)):
            for item in conllu_obj[i]:
                if item["head"] is not None:
                    for item_loop in conllu_obj[i]:
                        if item_loop["id"] == item["head"]:
                            output += " " + item_loop[key] + item[key]

        return output.replace(".", "PUNCT").replace(":", "")
Example #12
    def score_verbs(self, sentence):
        sentences = self.udpipe_model.tokenize(sentence)
        for s in sentences:
            self.udpipe_model.tag(s)
            self.udpipe_model.parse(s)
        conllu_txt = self.udpipe_model.write(
            sentences, "conllu")  #conllu|horizontal|vertical
        conllu_obj = conllu.parse(conllu_txt)

        words_id = []
        relations = []
        for word in conllu_obj[0]:
            if str(word['upostag']).lower() == 'verb':
                words_id.append(word['id'])
        if len(words_id):
            for word in conllu_obj[0]:
                if word['head'] in words_id or word['id'] in words_id:
                    relations.append(word['deprel'])

        BOR = self.count_vec.transform([' '.join(relations)
                                        ]).toarray().tolist()
        return BOR[0]
Example #13
def generate_wordlist(input_dir, output_dir):
    """
    This function generate a wordlist in json format by taking in a .conllu file and extracting unique words from
    it.
    Args:
        input_dir: the position of the input .conllu file.
        output_dir: the position of the output wordlist.json.

    Returns:
        A list of strings, each string represents a token.
    """
    with open(input_dir, "r") as f:
        source = conllu.parse(f.read())

    wordlist = []

    for item in source:
        wordlist.extend([a["form"] for a in item])

    wordlist = list(set(wordlist))

    with open(output_dir, "w") as f:
        json.dump(wordlist, f)
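A minimal call sketch with hypothetical file names, reading the result back to check it:

import json

generate_wordlist("ja_gsd-ud-test.conllu", "wordlist.json")  # hypothetical paths
with open("wordlist.json") as f:
    print(len(json.load(f)), "unique tokens")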
Example #14
def rule_main(args, data):
    file_name = 'rule_based/example.conllu'
    if args.dataset == 'CoQA':
        score = 0
        data = data[:5]
        length = len(data)
        with codecs.open(file_name, 'r', encoding='utf-8') as f:
            conllu_file = parse(f.read())

        # Build a dict of sentences keyed by index, with the following sentence stored under '<index>_answer'
        ids = range(len(conllu_file) // 2)
        examples = {}
        count = 0
        for i, s in enumerate(conllu_file):
            if i % 2 == 0:
                examples[ids[count]] = s
            else:
                examples[str(ids[count]) + '_answer'] = s
                count += 1
        current_pos = 0
        for data_ in data:
            summary = data_['summary']
            generate_summary = ''
            for i in range(len(data_['questions'])):
                generate_summary += qa2d(current_pos, examples) + ' '
                current_pos += 1
            score += get_score(generate_summary, summary, args.score)

    elif args.dataset == 'QuAC':
        score = 0
        length = len(data)

    else:
        score = 0
        length = len(data)

    score = score/length
    print('summary score: ', score)
Example #15
    def test_parse_CoNLL2009(self):
        field_parsers = DEFAULT_FIELD_PARSERS.copy()
        field_parsers.update({
            "pfeats":
            lambda line, i: parse_dict_value(line[i]),
            "phead":
            lambda line, i: parse_int_value(line[i]),
            "apreds":
            lambda line, i: TestParseCoNLL2009.parse_apreds(line[i:len(line)]),
        })

        from tests.fixtures import TESTCASES_CONLL2009

        sentences = parse(
            TESTCASES_CONLL2009[0],
            fields=('id', 'form', 'lemma', 'plemma', 'pos', 'ppos', 'feats',
                    'pfeats', 'head', 'phead', 'deprel', 'pdeprel', 'fillpred',
                    'pred', 'apreds'),
            field_parsers=field_parsers,
        )
        self.assertEqual(
            sentences[0][2],
            OrderedDict([('id', 3), ('form', 'knihy'), ('lemma', 'kniha'),
                         ('plemma', 'kniha'), ('pos', 'N'), ('ppos', 'N'),
                         ('feats',
                          OrderedDict([('SubPOS', 'N'), ('Gen', 'F'),
                                       ('Num', 'S'), ('Cas', '2'),
                                       ('Neg', 'A')])),
                         ('pfeats',
                          OrderedDict([('SubPOS', 'N'), ('Gen', 'F'),
                                       ('Num', 'S'), ('Cas', '2'),
                                       ('Neg', 'A')])), ('head', 1),
                         ('phead', 1), ('deprel', 'Adv'), ('pdeprel', 'Adv'),
                         ('fillpred', 'Y'), ('pred', 'kniha'),
                         ('apreds', [
                             None, None, None, None, None, None, None, 'DIR1',
                             None, None, None, None, None, None, None, None
                         ])]))
Example #16
 def _generate_examples(self, data_dir, subarchive_path, files):
     counter = 0
     for path, f in files:
         if path == subarchive_path:
             stream = tarfile.open(fileobj=f, mode="r|*")
             for tarinfo in stream:
                 file_path = tarinfo.name
                 if file_path.startswith(data_dir) and file_path.endswith(
                         ".conllu"):
                     data = stream.extractfile(tarinfo).read().decode(
                         "utf-8")
                     for sent in conllu.parse(data):
                         res = {
                             "idx": sent.metadata["sent_id"],
                             "text": sent.metadata["text"],
                             "tokens": [str(token["form"]) for token in sent],
                             "lemmas": [str(token["lemma"]) for token in sent],
                             "pos_tags": [str(token["upostag"]) for token in sent],
                             "xpos_tags": [str(token["xpostag"]) for token in sent],
                             "feats": [str(token["feats"]) for token in sent],
                             "head": [str(token["head"]) for token in sent],
                             "deprel": [str(token["deprel"]) for token in sent],
                             "deps": [str(token["deps"]) for token in sent],
                             "misc": [str(token["misc"]) for token in sent],
                         }
                         yield counter, res
                         counter += 1
                 stream.members = []
             del stream
             break
Example #17
    def return_deprelnegations(self, text):
        sentences = self.tokenize(text)

        # Then, we perform tagging and parsing for each sentence
        for s in sentences:
            self.tag(s)  # inplace tagging
            self.parse(s)  # inplace parsing

        conllu_txt = self.write(sentences,
                                "conllu")  # conllu|horizontal|vertical

        # print(conllu_txt)
        output = "FLAGdeprelnegations "
        conllu_obj = conllu.parse(conllu_txt)
        for i in range(0, len(sentences)):
            for item in conllu_obj[i]:

                if item['feats'] is not None:
                    for key, value in item['feats'].items():
                        if value == "Neg":
                            output += " " + item['deprel']

        return output
Example #18
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            sentences = parse(open(data_path, 'r').read())
            for sentence in sentences:
                for word in sentence:
                    form = word['form']
                    pos = word['upostag']
                    type = word['deprel']

                    real_word = form.split('_BERT_')[0]
                    for char in real_word:
                        char_alphabet.add(char)
                    form = DIGIT_RE.sub("0",
                                        form) if normalize_digits else form
                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if form not in vocab_set and (form in embedd_dict or
                                                  form.lower() in embedd_dict):
                        vocab_set.add(form)
                        vocab_list.append(form)
Example #19
def case_spacy_german():

    data = """
# sent_id = testtext.1
1   Dies     Dies    PRON    PDS   _   2   sb     _   _
2   ist   sein  AUX    VAFIN   _                  0   ROOT    _   _
3   ein   einen  DET    ART   _                  4   nk    _   _
4   Test     Test    NOUN   NN   _                 2   pd   _   _
5   .     .    PUNCT   $.   _                 2   punct   _   _

"""

    result = conllu.parse(data)

    return (
        wikiannotator.Annotator.createAnnotator('spacy', {'model_name': 'de_core_news_sm'}),
        wikiannotator.SpacyAnnotator,
        {
            'text': 'Dies ist ein Test.',
            'textname': 'testtext',
            'parse': result
        }
    )
Example #20
def split(path, out_path=None, n_parts=5, rng=None):
    """Split a CoNLL-U file into parts."""
    if out_path is None:
        out_path = Path("output")
    if rng is None:
        rng = random.Random()

    out_path.mkdir()

    print(f"Reading {path}", file=sys.stderr)
    with open(path, encoding="utf-8") as f:
        sents = parse(f.read())
    rng.shuffle(sents)

    count = [0] * n_parts
    for i, sent in enumerate(tqdm(sents)):
        count[i % n_parts] += 1
        with open(out_path / f"{i % n_parts:02}.conllu", "a",
                  encoding="utf-8") as f:
            print(sent.serialize(), file=f, end="")

    for i, cnt in enumerate(count):
        print(f"Part {i:02}: {cnt}", file=sys.stderr)
Example #21
def transduce(corpus, trans, control='normal', pos='upos'):
  """
  args:
    corpus: TextIOWrapper, the corpus to process
    trans: tbtk.TransitionSystemBase object
    control: str, format of the transition, backbone or normal
    pos: str, which pos column to choose, upos or xpos
  """
  sentences = [tbtk.ConllSent.from_conllu(sent) for sent in conllu.parse(corpus.read())]
  for s in sentences:
    transitions = []
    state = tbtk.State.init_from_sent(s)
    while not state.is_final():
      g = trans.gold_action(state)
      transitions.append(trans.action_to_str(g))
      state = trans.step(state, g)
    s.transitions = transitions
    print("form:", ' '.join(s.form))
    print("pos:", ' '.join(s.upos if pos == 'upos' else s.xpos))
    print("head:", ' '.join(str(x) for x in s.head))
    print("deprel:", ' '.join(s.deprel))
    print("transitions:", ' '.join(s.transitions))
    print()
Example #22
    def testComplex(self):
        content = [
            u'# newdoc id = doc1',
            u'# newpar id = par1',
            u'# sent_id = 1',
            u'# text = Результаты.Выводы',
            u'1	Результаты	_	_	_	_	_	_	_	SpaceAfter=No',
            u'2	.	_	_	_	_	_	_	_	SentenceBreak=Yes|SpaceAfter=No',
            u'3	Выводы	_	_	_	_	_	_	_	SpaceAfter=No',
            u'',
            u'',
        ]
        sentences = parse('\n'.join(content))
        sentences = split_sent(sentences[0])
        result = [extract_text(s, validate=False) for s in sentences]
        expected = [
            [u'Результаты', u'.'],
            [u'Выводы'],
        ]

        self.assertEqual(len(expected), len(result))
        for e, r in zip(expected, result):
            self.assertListEqual(e, r)
Example #23
def load_conllu(conllu_file):
    conllu_data = []
    with open(conllu_file, 'r', encoding='utf-8') as content_file:
        content = content_file.read()
        sentences = parse(content)
        for idx, sentence in enumerate(sentences):
            tokens, upos, head, deprel, offset = [], [], [], [], []
            reserved_offsets = []
            for widx, word in enumerate(sentence):
                if isinstance(word['id'], tuple):
                    # multi-word token, e.g., word['id'] = (4, '-', 5)
                    assert len(word['id']) == 3
                    indices = word['misc']['TokenRange'].split(':')
                    reserved_offsets.append([int(indices[0]), int(indices[1])])
                else:
                    tokens.append(word['form'])
                    upos.append(word['upostag'])
                    head.append(word['head'])
                    deprel.append(word['deprel'])
                    if word['misc'] is not None:
                        # single-word token
                        indices = word['misc']['TokenRange'].split(':')
                        offset.append([int(indices[0]), int(indices[1])])
                    elif len(reserved_offsets) > 0:
                        offset.append(reserved_offsets.pop())
                    else:
                        offset.append([-1, -1])

            assert len(tokens) == len(offset)
            sent_obj = OrderedDict([('id', sentence.metadata['sent_id']),
                                    ('text', sentence.metadata['text']),
                                    ('word', tokens), ('upos', upos),
                                    ('head', head), ('deprel', deprel),
                                    ('offset', offset)])
            conllu_data.append(sent_obj)

    return conllu_data
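A short usage sketch (the file name is hypothetical, and the file is assumed to carry TokenRange annotations in the misc column, as the code above expects):

sents = load_conllu("en_ewt-ud-dev.conllu")  # hypothetical file
print(sents[0]["text"])
print(list(zip(sents[0]["word"], sents[0]["offset"]))[:3])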
Example #24
def joinParse(data_file):
    data = data_file.read()
    sentences = parse(data)
    l = len(sentences)
    entireList = [[] for i in range(l)]
    indexes = []
    for i in range(l):
        sent = [(word['form'], word['upostag'], sentences[i].metadata)
                for word in sentences[i]]
        sent2 = [(word['form'], '<pad>', None) for word in sentences[i]]
        m = max(0, i - args.sngram // 2)
        M = min(l, i + args.sngram // 2 + args.sngram % 2)
        for j in range(m, M):
            if len(entireList[j]) == 0:
                entireList[j].append(('[CLS]', '<pad>', None))
            if i == j:
                indexes.append(
                    [len(entireList[j]),
                     len(entireList[j]) + len(sent)])
                entireList[j] += sent + [('[SEP]', '<pad>', None)]
            else:
                entireList[j] += sent2 + [('[SEP]', '<pad>', None)]

    return entireList, indexes
Example #25
    def create_from_ud(cls, data_file_list):
        """Initialize corpus from a path to a file in conllu format"""
        corpus = POSCorpus()
        corpus.sentences = []

        for data_file_path in data_file_list:
            with open(data_file_path, "r", encoding="utf-8") as data_file:
                data = data_file.read()
                data = conllu.parse(data)

            for token_list in data:
                sentence = []
                for token in token_list:
                    pos = token['upostag']
                    lemma = token['lemma']
                    word = token['form']
                    # Sometimes the corpus doesn't have words, only underscores
                    if word == '_' or lemma == '_':
                        continue
                    sentence.append({'word': word, 'lemma': lemma, 'pos': pos})
                if len(sentence) > 0:
                    corpus.sentences.append(sentence)

        return corpus
Example #26
 def count_frequency(
     self,
     files,
 ):
     """
     Count frequencies for all lemmas in corpus
     :param files: list with paths to all conllu files in corpus
     :return: list with all types in corpus, sorted descending by frequency
     """
     list_of_words = []
     for file in files:
         with open(file, "r") as fin:
             data = fin.read()
             for sentence in parse(data):
                 for word in sentence:
                     list_of_words.append(word["lemma"])
     counted = Counter(list_of_words)
     frequency_list = sorted(
         counted,
         key=counted.__getitem__,
         reverse=True,
     )
     return frequency_list
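The same ranking can be produced a bit more directly with Counter.most_common; a small equivalent sketch on invented toy data (ordering of equal-frequency lemmas may differ):

from collections import Counter

sample_lemmas = ["kniha", "den", "kniha"]  # invented toy data
counted = Counter(sample_lemmas)
frequency_list = [lemma for lemma, _ in counted.most_common()]  # ['kniha', 'den']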
Example #27
def extract_sentences_from_conllu_to_csv(src_dir, name, dst_dir):
    """Extract sentences from Universal Dependency structures in .conllu file
    and store in .csv file.

    Parameters
    ----------
    src_dir : str
        directory of the .conllu file
    name : str
        name of the .conllu file without the '.conllu' extension
        (e.g. including the '-train' or '-test' suffix)
    dst_dir : str
        directory where the .csv file should be saved
    """
    filename = src_dir + name + '.conllu'
    print('Processing ' + filename + ' ... ', end='', flush=True)

    with open(filename, "r", encoding="utf-8") as fp:
        raw_data = fp.read()
    ud_dataset = parse(raw_data)

    sentences = []
    for tokenlist in ud_dataset:
        sentence = []
        for token in tokenlist:
            word = token['form']
            sentence.append(word)
        sentences.append(sentence)

    filename = dst_dir + name + '.csv'
    with open(filename, mode="w", encoding="utf-8", newline='') as fp:
        csv_writer = csv.writer(fp, delimiter=' ')
        csv_writer.writerows(sentences)

    print('DONE ', end='')
    print((len(ud_dataset), len(sentences)))
Example #28
def load_conllu(conllu_file):
    conllu_data = dict()
    with open(conllu_file, 'r', encoding='utf-8') as content_file:
        content = content_file.read()
        sentences = parse(content)
        for idx, sentence in enumerate(sentences):
            tokens, upos, head, deprel = [], [], [], []
            for widx, word in enumerate(sentence):
                if isinstance(word['id'], tuple):
                    # multi-word token, e.g., word['id'] = (4, '-', 5)
                    assert len(word['id']) == 3
                else:
                    tokens.append(word['form'])
                    upos.append(word['upostag'])
                    head.append(word['head'])
                    deprel.append(word['deprel'])

            sent_obj = OrderedDict([('id', sentence.metadata['sent_id']),
                                    ('text', sentence.metadata['text']),
                                    ('word', tokens), ('upos', upos),
                                    ('head', head), ('deprel', deprel)])
            conllu_data[sent_obj['id']] = sent_obj

    return conllu_data
Example #29
    def load_as_conllu(self, predefined_splits: bool = False):
        """
        :param bool predefined_splits: Boolean
        :return: A single parsed conllu list
                or a list of train, dev, test split parsed conllu list
                depending on predefined_split
        """
        with open('{}/CDT_coref.conllu'.format(self.dataset_dir)) as f:
            conlist = conllu.parse(f.read(), fields=["id", "form", "lemma", "upos", 'xpos', 'feats', 'head', 'deprel','deps', 'misc', 'coref_id', 'coref_rel', 'doc_id', 'qid'])

        if not predefined_splits:
            return conlist

        parts = [None, None, None] 
        sent_parts = [[], [], []]
        for i, part in enumerate(['train', 'dev', 'test']):
            with open('{}/CDT_{}_ids.json'.format(self.dataset_dir,part)) as f:
                parts[i] = json.load(f)
            for sentence in conlist:
                if sentence[0]["doc_id"] in parts[i]:
                    sent_parts[i].append(sentence)


        return sent_parts
Example #30
def text_to_json(text, model, sep="|", parse_system="udpipe"):
    """

    Parameters
    ----------
    text: str
    model: Model
    sep: str
    parse_system: str
    
    Returns
    -------
    l_sentences: list
    """
    if parse_system == "spacy":
        import spacy

        # Requires the English model: python -m spacy download en
        nlp = spacy.load('en')
        tokens = nlp(text)

        tmp_text = str(text)
        posStart_prev = 0
        l_sentences = []
        for sent in tokens.sents:
            l_sent = []
            for word in sent:
                posStart = str.find(tmp_text, word.text, posStart_prev)
                posStart_prev = posStart
                d_word = {
                    "id": word.orth,
                    "forma": word.text,
                    "lemma": word.lemma_,
                    "norm": word.norm_,
                    "pos": word.pos_,
                    'tag': word.tag_,
                    "grm": "",
                    "len": word.__len__(),
                    "posStart": posStart,
                    "dom": word.head.text,
                    "link": word["deprel"]
                }
                l_sent.append(d_word)
            l_sentences.append(l_sent)

    elif parse_system == "udpipe":
        segmented = ufal.udpipe.Pipeline(model.model, "tokenize",
                                         ufal.udpipe.Pipeline.NONE,
                                         ufal.udpipe.Pipeline.NONE,
                                         "").process(text)
        sentences = model.read(segmented, "conllu")
        for sent in sentences:
            model.tag(sent)
            model.parse(sent)
        res_conllu = model.write(sentences, "conllu")

        tmp_text = str(text)
        posStart_prev = 0
        l_sentences = []
        for sent in conllu.parse(res_conllu):
            l_sent = []
            for word in sent:
                posStart = str.find(tmp_text, word["form"], posStart_prev)
                posStart_prev = posStart
                d_word = {
                    "id": word["id"],
                    "forma": word["form"],
                    "lemma": word["lemma"],
                    "pos": word["upostag"],
                    "grm": get_feats_string(word["feats"], sep=sep),
                    "len": len(word["form"]),
                    "posStart": posStart,
                    "dom": word["head"],
                    "link": word["deprel"]
                }
                l_sent.append(d_word)
            l_sentences.append(l_sent)
    else:
        print("Error. Unsupported parsing system. Use 'conll' or 'spacy'.")

    return l_sentences