Example no. 1
0
def get_all_stats(train_fn, dev_fn):
    """Compute and plot corpus statistics for a CoNLL training file.

    Currently only the lemma-to-form ratio plot is produced; the other
    planned statistics are listed in the TODO below.

    :param train_fn: path to the training CoNLL data file
    :param dev_fn: path to the dev CoNLL data file (not used yet)
    :return: None (writes ``lem2form_ratio.pdf`` as a side effect)
    """
    # TODO: remaining stats to retrieve -- lengths, branching_factor,
    #       unique_feats, oov_lemmas, oov_forms, oov_chars,
    #       proper_nouns_count, noise, junk
    train_raw = read_conll_data_file(train_fn)
    train_tokens = raw_data_to_tokens(train_raw)

    # Lemma-to-form ratio plot
    plot_lem2form_ratio(train_tokens, fname='lem2form_ratio.pdf')
Example no. 2
0
def extract_major_dict(train_conll_dir):
    """Build, per language, a lemma -> most frequent surface form map.

    Each file in *train_conll_dir* is assumed to be named
    ``<lang>-...``; its tokens are read and, for every lowercased
    lemma, the most common lowercased gold form is recorded.

    :param train_conll_dir: directory with per-language CoNLL files
    :return: dict mapping language code to a {lemma: majority_form} dict
    """
    majority_by_lang = {}

    for fname in os.listdir(train_conll_dir):
        language = fname.split('-')[0]

        path = os.path.join(train_conll_dir, fname)
        sentences = raw_data_to_tokens(ConllFileReader.read_file(path))

        # One Counter of surface forms per lemma.
        form_counts = {}
        for sentence in sentences:
            for tok in sentence:
                counter = form_counts.setdefault(tok.LEMMA.lower(), Counter())
                counter[tok.GOLD_FORM.lower()] += 1

        majority_by_lang[language] = {
            lemma: counter.most_common(1)[0][0]
            for lemma, counter in form_counts.items()
        }

    return majority_by_lang
Example no. 3
0
    def training_setup(self, train_data_fname, dev_data_fname):
        """Read, build the vocabulary for, and vectorize train/dev data.

        :param train_data_fname: path to the training CoNLL data file
        :param dev_data_fname: path to the dev CoNLL data file
        :raises ValueError: if either filename is None
        """
        # Validate explicitly rather than with `assert`, which is
        # stripped under `python -O`.
        if train_data_fname is None:
            raise ValueError('train_data_fname must not be None')
        if dev_data_fname is None:
            raise ValueError('dev_data_fname must not be None')

        train_raw = read_conll_data_file(train_data_fname)
        dev_raw = read_conll_data_file(dev_data_fname)

        train_graphs = [dg_from_tokens(toks) for toks in raw_data_to_tokens(train_raw)]
        dev_graphs = [dg_from_tokens(toks) for toks in raw_data_to_tokens(dev_raw)]

        self.vocab.setup(vocab_path=self.vocab_fn,
                         data=train_graphs,
                         lower=self.lower,
                         source='depgraphs')

        # Vectorize data
        self.train = self.vectorize_graphs(train_graphs)
        self.dev = self.vectorize_graphs(dev_graphs)

        # The NN needs the input dimensionality: take one vectorized
        # instance (all columns except the last, presumably the label
        # -- TODO confirm) and measure its length.
        random_instance_x = self.dev[0, :-1]
        self.num_features = len(random_instance_x)
Example no. 4
0
    def extract_lemmas_forms_feature_dicts(raw_data):
        """Extract parallel lists of lemmas, gold forms and feature dicts.

        :param raw_data: raw CoNLL data as returned by the file reader
        :return: tuple (lemmas, forms, feat_dicts), aligned by token
        """
        # Flatten the tokenized instances into one token sequence, then
        # derive each parallel list from it.
        tokens = [tok for instance in raw_data_to_tokens(raw_data)
                  for tok in instance]

        lemmas = [tok.LEMMA for tok in tokens]
        forms = [tok.GOLD_FORM for tok in tokens]
        feat_dicts = [BaseMorphData.extract_feature_dict(tok) for tok in tokens]

        return lemmas, forms, feat_dicts
Example no. 5
0
def main():
    """Majority-form baseline over parallel dev/train CoNLL dirs.

    For each dev token, emit the most frequent training-set form of its
    lemma, backing off to the lemma itself when unseen.

    Argv: <dev_conll_dir> <train_conll_dir> <output_dir>
    Writes parallel reference/hypothesis files (one form per line)
    under <output_dir>/ref and <output_dir>/hyp.
    """
    logging.basicConfig(level=logging.DEBUG)

    dev_conll_dir = os.path.abspath(sys.argv[1])
    train_conll_dir = os.path.abspath(sys.argv[2])
    output_dir = os.path.abspath(sys.argv[3])

    logging.info('CoNLL files dev dir: %s', dev_conll_dir)
    logging.info('CoNLL files train dir: %s', train_conll_dir)

    hypotheses_dir = os.path.join(output_dir, 'hyp')
    references_dir = os.path.join(output_dir, 'ref')

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(hypotheses_dir, exist_ok=True)
    os.makedirs(references_dir, exist_ok=True)

    lem2form_major_all = extract_major_dict(train_conll_dir)

    for filename in os.listdir(dev_conll_dir):

        lang = filename.split('-')[0]

        conll_data_fname = os.path.join(dev_conll_dir, filename)
        ref_filename = os.path.join(references_dir, filename)
        out_filename = os.path.join(hypotheses_dir, filename)

        raw_data = ConllFileReader.read_file(conll_data_fname)
        tokenized_data = raw_data_to_tokens(raw_data)

        # NOTE(review): raises KeyError if a dev language has no train
        # file -- presumably the two dirs are parallel; verify.
        lem2form_major = lem2form_major_all[lang]

        with open(ref_filename, 'w') as reffh, open(out_filename,
                                                    'w') as hypfh:

            for instance in tokenized_data:
                for t in instance:
                    form = t.GOLD_FORM
                    lemma = t.LEMMA.lower()
                    # back off to lemma, if no form available
                    major_form = lem2form_major.get(lemma, lemma)

                    reffh.write('%s\n' % form)
                    hypfh.write('%s\n' % major_form)

    logging.info('Done')
Example no. 6
0
def main():
    """Random-order baseline: shuffle each dev sentence's forms.

    Argv: <dev_conll_dir> <output_dir>
    Writes parallel reference/hypothesis files in a CoNLL-like comment
    format (sent_id + text lines) under <output_dir>/ref and /hyp.
    """
    logging.basicConfig(level=logging.DEBUG)

    dev_conll_dir = os.path.abspath(sys.argv[1])
    logging.info('CoNLL files dir: %s', dev_conll_dir)

    output_dir = os.path.abspath(sys.argv[2])
    hypotheses_dir = os.path.join(output_dir, 'hyp')
    references_dir = os.path.join(output_dir, 'ref')

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(hypotheses_dir, exist_ok=True)
    os.makedirs(references_dir, exist_ok=True)

    for filename in os.listdir(dev_conll_dir):

        conll_data_fname = os.path.join(dev_conll_dir, filename)
        ref_filename = os.path.join(references_dir, filename)
        out_filename = os.path.join(hypotheses_dir, filename)

        raw_data = ConllFileReader.read_file(conll_data_fname)
        tokenized_data = raw_data_to_tokens(raw_data)

        with open(ref_filename, 'w') as reffh, open(out_filename,
                                                    'w') as hypfh:

            for idx, instance in enumerate(tokenized_data):
                forms = [t.GOLD_FORM for t in instance]

                # Reference: gold word order.
                reffh.write('# sent_id = %d\n' % idx)
                reffh.write('# text = %s\n' % ' '.join(forms))
                reffh.write('\n')

                # Hypothesis: same tokens, random order (in-place
                # shuffle; not seeded, so output is nondeterministic).
                random.shuffle(forms)
                hypfh.write('# sent_id = %d\n' % idx)
                hypfh.write('# text = %s\n' % ' '.join(forms))
                hypfh.write('\n')

    logging.info('Done')
def main():
    """Lemma baseline: predict each dev token's lemma as its form.

    Argv: <dev_conll_dir> <output_dir>
    Writes parallel reference/hypothesis files (one form/lemma per
    line) under <output_dir>/ref and <output_dir>/hyp.
    """
    logging.basicConfig(level=logging.DEBUG)

    dev_conll_dir = os.path.abspath(sys.argv[1])
    logging.info('CoNLL files dir: %s', dev_conll_dir)

    output_dir = os.path.abspath(sys.argv[2])
    hypotheses_dir = os.path.join(output_dir, 'hyp')
    references_dir = os.path.join(output_dir, 'ref')

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(hypotheses_dir, exist_ok=True)
    os.makedirs(references_dir, exist_ok=True)

    for filename in os.listdir(dev_conll_dir):

        conll_data_fname = os.path.join(dev_conll_dir, filename)
        ref_filename = os.path.join(references_dir, filename)
        out_filename = os.path.join(hypotheses_dir, filename)

        raw_data = ConllFileReader.read_file(conll_data_fname)
        tokenized_data = raw_data_to_tokens(raw_data)

        with open(ref_filename, 'w') as reffh, open(out_filename, 'w') as hypfh:

            for instance in tokenized_data:
                for t in instance:
                    form = t.GOLD_FORM
                    lemma = t.LEMMA

                    reffh.write('%s\n' % form)
                    hypfh.write('%s\n' % lemma)

    logging.info('Done')