def get_all_stats(train_fn, dev_fn):
    """Compute and plot corpus statistics for the training data.

    Currently only the lemma-to-form ratio plot is produced
    ('lem2form_ratio.pdf'). The *dev_fn* argument is accepted for
    interface compatibility but is not used yet.

    :param train_fn: path to the training CoNLL data file
    :param dev_fn: path to the development CoNLL data file (currently unused)
    :return: None
    """
    train_raw = read_conll_data_file(train_fn)
    train_tokens = raw_data_to_tokens(train_raw)
    # lem2form_ratio
    plot_lem2form_ratio(train_tokens, fname='lem2form_ratio.pdf')
def extract_major_dict(train_conll_dir):
    """Build, per language, a mapping from each lemma to its most frequent form.

    File names in *train_conll_dir* are assumed to start with the language
    code followed by '-' (e.g. 'en-train.conll').

    :param train_conll_dir: directory with training CoNLL files
    :return: dict mapping language code -> {lemma: majority form}
    """
    lem2form_major_all = {}
    for filename in os.listdir(train_conll_dir):
        lang = filename.split('-')[0]
        raw_data = ConllFileReader.read_file(
            os.path.join(train_conll_dir, filename))
        # Count form occurrences per (lowercased) lemma in a single pass.
        form_counts = {}
        for sentence in raw_data_to_tokens(raw_data):
            for token in sentence:
                counter = form_counts.setdefault(token.LEMMA.lower(), Counter())
                counter[token.GOLD_FORM.lower()] += 1
        # For each lemma keep its single most frequent form
        # (ties broken by first occurrence, as Counter preserves order).
        lem2form_major_all[lang] = {
            lemma: counts.most_common(1)[0][0]
            for lemma, counts in form_counts.items()
        }
    return lem2form_major_all
def training_setup(self, train_data_fname, dev_data_fname):
    """Load train/dev CoNLL data, set up the vocabulary, and vectorize.

    Sets ``self.train``, ``self.dev`` and ``self.num_features``.

    :param train_data_fname: path to the training CoNLL data file
    :param dev_data_fname: path to the development CoNLL data file
    :raises ValueError: if either filename is None
    """
    # Validate with explicit exceptions: plain asserts are stripped under -O.
    if train_data_fname is None:
        raise ValueError('train_data_fname must not be None')
    if dev_data_fname is None:
        raise ValueError('dev_data_fname must not be None')
    train_raw = read_conll_data_file(train_data_fname)
    dev_raw = read_conll_data_file(dev_data_fname)
    train_graphs = [dg_from_tokens(toks)
                    for toks in raw_data_to_tokens(train_raw)]
    dev_graphs = [dg_from_tokens(toks)
                  for toks in raw_data_to_tokens(dev_raw)]
    self.vocab.setup(vocab_path=self.vocab_fn,
                     data=train_graphs,
                     lower=self.lower,
                     source='depgraphs')
    # Vectorize data
    self.train = self.vectorize_graphs(train_graphs)
    self.dev = self.vectorize_graphs(dev_graphs)
    # Set the number of extracted features (needed for the NN);
    # the last column of a vectorized instance is assumed to be the
    # label, so features are everything before it — TODO confirm.
    random_instance_x = self.dev[0, :-1]
    self.num_features = len(random_instance_x)
def extract_lemmas_forms_feature_dicts(raw_data):
    """Flatten tokenized CoNLL data into three parallel lists.

    :param raw_data: raw CoNLL data as returned by the file reader
    :return: tuple ``(lemmas, forms, feat_dicts)`` of equally long lists,
        aligned token-by-token
    """
    lemmas = []
    forms = []
    feat_dicts = []
    for sentence in raw_data_to_tokens(raw_data):
        for token in sentence:
            lemmas.append(token.LEMMA)
            forms.append(token.GOLD_FORM)
            feat_dicts.append(BaseMorphData.extract_feature_dict(token))
    return lemmas, forms, feat_dicts
def main():
    """Majority-form baseline.

    For each dev token, write the most frequent training form of its
    lemma to the hypothesis file (backing off to the lemma itself), and
    the gold form to the reference file, one token per line.

    Usage: script.py DEV_CONLL_DIR TRAIN_CONLL_DIR OUTPUT_DIR
    """
    logging.basicConfig(level=logging.DEBUG)
    dev_conll_dir = os.path.abspath(sys.argv[1])
    train_conll_dir = os.path.abspath(sys.argv[2])
    output_dir = os.path.abspath(sys.argv[3])
    logging.info('CoNLL files dev dir: %s', dev_conll_dir)
    logging.info('CoNLL files train dir: %s', train_conll_dir)
    hypotheses_dir = os.path.join(output_dir, 'hyp')
    references_dir = os.path.join(output_dir, 'ref')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(hypotheses_dir, exist_ok=True)
    os.makedirs(references_dir, exist_ok=True)
    lem2form_major_all = extract_major_dict(train_conll_dir)
    for filename in os.listdir(dev_conll_dir):
        lang = filename.split('-')[0]
        conll_data_fname = os.path.join(dev_conll_dir, filename)
        ref_filename = os.path.join(references_dir, filename)
        out_filename = os.path.join(hypotheses_dir, filename)
        raw_data = ConllFileReader.read_file(conll_data_fname)
        tokenized_data = raw_data_to_tokens(raw_data)
        lem2form_major = lem2form_major_all[lang]
        with open(ref_filename, 'w') as reffh, open(out_filename, 'w') as hypfh:
            for instance in tokenized_data:
                for t in instance:
                    lemma = t.LEMMA.lower()
                    # back off to lemma, if no form available
                    major_form = lem2form_major.get(lemma, lemma)
                    reffh.write('%s\n' % t.GOLD_FORM)
                    hypfh.write('%s\n' % major_form)
    logging.info('Done')
def main():
    """Random word-order baseline.

    Writes each dev sentence to the reference file, and the same
    sentence with its forms shuffled to the hypothesis file, both in a
    minimal CoNLL-U-style format with '# sent_id' and '# text' lines.

    Usage: script.py DEV_CONLL_DIR OUTPUT_DIR
    """
    logging.basicConfig(level=logging.DEBUG)
    dev_conll_dir = os.path.abspath(sys.argv[1])
    logging.info('CoNLL files dir: %s', dev_conll_dir)
    output_dir = os.path.abspath(sys.argv[2])
    hypotheses_dir = os.path.join(output_dir, 'hyp')
    references_dir = os.path.join(output_dir, 'ref')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(hypotheses_dir, exist_ok=True)
    os.makedirs(references_dir, exist_ok=True)
    for filename in os.listdir(dev_conll_dir):
        conll_data_fname = os.path.join(dev_conll_dir, filename)
        ref_filename = os.path.join(references_dir, filename)
        out_filename = os.path.join(hypotheses_dir, filename)
        raw_data = ConllFileReader.read_file(conll_data_fname)
        tokenized_data = raw_data_to_tokens(raw_data)
        with open(ref_filename, 'w') as reffh, open(out_filename, 'w') as hypfh:
            for idx, instance in enumerate(tokenized_data):
                forms = [t.GOLD_FORM for t in instance]
                # Reference: the original token order.
                reffh.write('# sent_id = %d\n' % idx)
                reffh.write('# text = %s\n' % ' '.join(forms))
                reffh.write('\n')
                # Hypothesis: same forms, shuffled in place.
                random.shuffle(forms)
                hypfh.write('# sent_id = %d\n' % idx)
                hypfh.write('# text = %s\n' % ' '.join(forms))
                hypfh.write('\n')
    logging.info('Done')
def main():
    """Identity (lemma) baseline.

    For each dev token, write its lemma to the hypothesis file and its
    gold form to the reference file, one token per line.

    Usage: script.py DEV_CONLL_DIR OUTPUT_DIR
    """
    logging.basicConfig(level=logging.DEBUG)
    dev_conll_dir = os.path.abspath(sys.argv[1])
    logging.info('CoNLL files dir: %s', dev_conll_dir)
    output_dir = os.path.abspath(sys.argv[2])
    hypotheses_dir = os.path.join(output_dir, 'hyp')
    references_dir = os.path.join(output_dir, 'ref')
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(hypotheses_dir, exist_ok=True)
    os.makedirs(references_dir, exist_ok=True)
    for filename in os.listdir(dev_conll_dir):
        conll_data_fname = os.path.join(dev_conll_dir, filename)
        ref_filename = os.path.join(references_dir, filename)
        out_filename = os.path.join(hypotheses_dir, filename)
        raw_data = ConllFileReader.read_file(conll_data_fname)
        tokenized_data = raw_data_to_tokens(raw_data)
        with open(ref_filename, 'w') as reffh, open(out_filename, 'w') as hypfh:
            for instance in tokenized_data:
                for t in instance:
                    reffh.write('%s\n' % t.GOLD_FORM)
                    hypfh.write('%s\n' % t.LEMMA)
    logging.info('Done')