def _create(self):
    file = open(self.input_path, 'rb')
    su = SerialUnpickler(file)

    unique_dict = create_dict(su)
    # The per-feature value indexing previously inlined here (iterating over the
    # samples of sentence and sentence_orig and assigning a consecutive index to
    # every unseen feature value) now lives in create_dict.

    file.close()

    file = open(self.output_path(), 'wb')
    pickle.dump(unique_dict, file)
    file.close()
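# create_dict is defined elsewhere in the package; the sketch below is a
# hypothetical reconstruction of what it is assumed to compute, based on the
# feature-indexing loop that used to be inlined in _create above. It is not
# the actual implementation.
def create_dict_sketch(serial_unpickler):
    import collections
    import numbers
    # Map feature name -> {feature value -> consecutive integer index}.
    unique_dict = collections.defaultdict(dict)
    index = collections.defaultdict(int)
    for paragraph in serial_unpickler:
        for sentence, sentence_orig in paragraph:
            for sample in list(sentence) + list(sentence_orig):
                for name, values in sample.features.items():
                    # Scalar feature values are treated as one-element lists.
                    if isinstance(values, (str, numbers.Number)):
                        values = [values]
                    for value in values:
                        if value not in unique_dict[name]:
                            unique_dict[name][value] = index[name]
                            index[name] += 1
    return unique_dict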
def learn(self, path, stop=-1, start=0, ids=None):
    lemma_count = collections.defaultdict(lambda: collections.defaultdict(int))
    if ids is None:
        ids = []
    su = SerialUnpickler(open(path, 'rb'), stop=stop, start=start, ids=ids)
    for paragraph in su:
        for sentence, sentence_orig in paragraph:
            for sample in sentence_orig:
                # Some samples do not have a lemma because they are not on the
                # gold segmentation.
                if 'lemma' in sample.features:
                    lemma_count[(sample.features['token'],
                                 sample.features['label'])][sample.features['lemma']] += 1

    # Example: lemma_count[('Morawieckiego', 'subst:sg:gen:m1')]
    # -> defaultdict(<class 'int'>, {'morawieckiego': 7, 'Morawiecki': 7, 'Morawieckiego': 1})
    for k, v in lemma_count.items():
        # TODO: lemmatize depending on the word's position in the sentence -
        # when the word is not sentence-initial, keep the capital letter as well.
        best = sorted(v.items(), key=lambda x: (x[1], x[0]), reverse=True)[0]
        # TODO: handle several lemmas with the same count (ties).
        self.lemmas[k] = best[0]
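# Illustration only (not part of the pipeline): the tie-breaking behaviour of the
# sorted(...) call above, using the counts from the example comment. Ties on the
# count fall back to reverse lexicographic order of the lemma string, which is
# what the "ties" TODO refers to.
def _lemma_tiebreak_example():
    counts = {'morawieckiego': 7, 'Morawiecki': 7, 'Morawieckiego': 1}
    best = sorted(counts.items(), key=lambda x: (x[1], x[0]), reverse=True)[0]
    # Among the two lemmas with count 7, the lowercase variant is
    # lexicographically greater, so reverse ordering puts it first.
    return best  # ('morawieckiego', 7)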
def _create(self):
    file = open(self.input_path, 'rb')
    su = SerialUnpickler(file)

    file2 = open(self.output_path(), 'wb')
    sp = SerialPickler(file2)

    paragraph: Paragraph
    for paragraph in tqdm(su, total=18484,
                          desc='Processing %s' % str(self.__class__.__name__)):
        paragraph_sequence = []
        for sentence, sentence_gold in paragraph:
            sequence = list(sentence)
            for sample in sentence:
                sample.features['tags4e3'] = create_token_features(
                    sample.features['token'], sample.features['tags'],
                    sample.features['space_before'])
            paragraph_sequence.append((sequence, sentence_gold))
        sp.add(paragraph_sequence)

    file.close()
    file2.close()
def count_sentences(path, ids=None):
    if ids is None:
        ids = []
    count = 0
    su = SerialUnpickler(open(path, 'rb'), ids=ids)
    for paragraph in su:
        for sentence in paragraph:
            count += 1
    return count
def _create(self):
    file = open(self.input_path, 'rb')
    su = SerialUnpickler(file)

    file2 = open(self.output_path(), 'wb')
    sp = SerialPickler(file2)

    paragraph: Paragraph
    for paragraph in tqdm(su, total=18484,
                          desc='Processing %s' % str(self.__class__.__name__)):
        paragraph_sequence = preprocess_paragraph_preanalyzed(paragraph)
        sp.add(paragraph_sequence)

    file.close()
    file2.close()
def _create(self):
    file = open(self.input_path, 'rb')
    su = SerialUnpickler(file)

    file2 = open(self.output_path(), 'wb')
    sp = SerialPickler(file2)

    import jsonlines
    jf = jsonlines.open(self.output_path() + '.jsonl', mode='w')

    paragraph: Paragraph
    for paragraph in tqdm(su, total=18484,
                          desc='Processing %s' % str(self.__class__.__name__)):
        paragraph_sequence = preprocess_paragraph_reanalyzed(paragraph)
        jf.write(serialize_sample_paragraph(paragraph_sequence))
        sp.add(paragraph_sequence)

    file.close()
    file2.close()
import math
from argparse import ArgumentParser

from krnnt.serial_pickle import SerialPickler, SerialUnpickler, count_samples

if __name__ == '__main__':
    parser = ArgumentParser(description='Split data')
    parser.add_argument('input_path', help='input path to data')
    parser.add_argument('output_path1', help='output path to data')
    parser.add_argument('output_path2', help='output path to data')
    parser.add_argument('ratio', type=float,
                        help='ratio of data to write to the first output')
    args = parser.parse_args()

    num_data = count_samples(args.input_path)
    first_part = math.ceil(num_data * args.ratio)

    sp1 = SerialPickler(open(args.output_path1, 'wb'))
    sp2 = SerialPickler(open(args.output_path2, 'wb'))
    su = SerialUnpickler(open(args.input_path, 'rb'))

    for i, paragraph in enumerate(su):
        if i < first_part:
            sp1.add(paragraph)
        else:
            sp2.add(paragraph)

    sp1.close()
    sp2.close()
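# Illustration of the split boundary (assuming a corpus of 18484 paragraphs, the
# total used elsewhere in this repository): with ratio 0.9,
# first_part = math.ceil(18484 * 0.9) = 16636, so paragraphs 0..16635 go to
# output_path1 and the remaining 1848 paragraphs go to output_path2.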
                sentence2.append({
                    'token': token.form,
                    'sep': token.space_before,
                    'tag': token.gold_form.tags,
                    'lemmas': [token.gold_form.lemma],
                })
        except AttributeError:
            # Omit the sentence if some token does not have a gold tag.
            continue

    return paragraph2


if __name__ == '__main__':
    parser = ArgumentParser(
        description='Export data (before preprocessing) to format')
    parser.add_argument('input_path', help='input path to data')
    parser.add_argument('output_path', help='output path to data')
    parser.add_argument('-f', '--format', default='txt', help='output format')
    args = parser.parse_args()

    with open(args.input_path, 'rb') as file:
        su = SerialUnpickler(file)
        converter = get_output_converter(args.format)
        string = converter(
            (paragraph_to_result(paragraph_gold) for paragraph_gold in su))
        with open(args.output_path, 'w') as output_file:
            output_file.write(string)
def generate_arrays_from_file(path, unique_features_dict, feature_name,
                              label_name, stop=-1, start=0, ids=None,
                              keep_unaligned=False, keep_infinity=True):
    if ids is None:
        ids = []
    while 1:
        su = SerialUnpickler(open(path, 'rb'), stop=stop, start=start, ids=ids)
        for paragraph in su:
            for sentence, sentence_orig in paragraph:
                X_sentence = []
                y_sentence = []

                if not sentence:
                    continue  # TODO

                same_segmentation = len(sentence) == len(sentence_orig) and len(
                    [sample for sample in sentence if 'label' in sample.features])
                if (not same_segmentation) and not keep_unaligned:
                    continue

                if keep_unaligned and same_segmentation:
                    for sample in sentence:
                        X_sentence.append(np.array(
                            k_hot(sample.features[feature_name],
                                  unique_features_dict[feature_name])))
                        if label_name == 'label':
                            y_sentence.append(np.array(
                                k_hot([sample.features[label_name]],
                                      unique_features_dict[label_name])))
                        else:
                            y_sentence.append(np.array(
                                k_hot(sample.features[label_name],
                                      unique_features_dict[label_name])))
                else:
                    for sample in sentence:
                        X_sentence.append(np.array(
                            k_hot(sample.features[feature_name],
                                  unique_features_dict[feature_name])))
                    for sample in sentence_orig:
                        if label_name == 'label':
                            y_sentence.append(np.array(
                                k_hot([sample.features[label_name]],
                                      unique_features_dict[label_name])))
                        else:
                            y_sentence.append(np.array(
                                k_hot(sample.features[label_name],
                                      unique_features_dict[label_name])))

                yield (X_sentence, y_sentence, sentence, sentence_orig)

        if not keep_infinity:
            break
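# k_hot is defined elsewhere in the package; the sketch below is a hypothetical
# reconstruction of the multi-hot encoding the generator above assumes, with the
# signature inferred from its calls (a list of feature values plus the
# value-to-index mapping built when the feature dictionary was created).
def k_hot_sketch(values, value_to_index):
    import numpy as np
    # One binary slot per known value; set to 1 for every value present in the
    # sample's feature list; unknown values are silently ignored.
    vector = np.zeros(len(value_to_index), dtype=np.float32)
    for value in values:
        if value in value_to_index:
            vector[value_to_index[value]] = 1.0
    return vector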
Reanalyze corpus with Maca.

E.g. prog train-gold.spickle train-reanalyzed.spickle
"""

if __name__ == '__main__':
    parser = ArgumentParser(usage=usage)
    parser.add_argument('file_path', type=str, help='path to corpus')
    parser.add_argument('output_path', type=str, help='save path')
    parser.add_argument('--maca_config', default='morfeusz2-nkjp',
                        help='Maca config')
    parser.add_argument('--toki_config_path', default='',
                        help='Toki config path (directory)')
    args = parser.parse_args()

    file1 = open(args.file_path, 'rb')
    su_gold = SerialUnpickler(file1)

    file2 = open(args.output_path, 'wb')
    sp = SerialPickler(file2)

    maca_analyzer = MacaAnalyzer(args.maca_config)

    paragraph_gold: Paragraph
    for j, paragraph_gold in tqdm(enumerate(su_gold), total=18484,
                                  desc='Morphological analysis'):
        paragraph_raw = paragraph_gold.text()

        paragraph_reanalyzed = maca_analyzer.analyze(paragraph_raw)

        print('Number of sentences by Maca vs gold',
              len(paragraph_reanalyzed.sentences), len(paragraph_gold.sentences))

        paragraph_reanalyzed = align_paragraphs(paragraph_reanalyzed,
                                                paragraph_gold)
if __name__ == '__main__':
    parser = ArgumentParser(
        description='Combines analyzed corpus with gold. '
                    'Analyzed corpus must be with gold segmentation.')
    parser.add_argument('gold_path', help='')
    parser.add_argument('analyzed_path', help='')
    parser.add_argument('output_path', help='')
    args = parser.parse_args()

    file_path1 = args.gold_path
    file_path2 = args.analyzed_path
    output_path = args.output_path

    file1 = open(file_path1, 'rb')
    su_gold = SerialUnpickler(file1)

    file2 = open(file_path2, 'rb')
    su_analyzed = SerialUnpickler(file2)

    file3 = open(output_path, 'wb')
    sp = SerialPickler(file3)

    for paragraph_gold in su_gold:
        for sentence_gold in paragraph_gold:
            paragraph_analyzed = next(iter(su_analyzed))
            # Each analyzed paragraph is expected to contain exactly one sentence,
            # aligned with the gold segmentation.
            assert len(paragraph_analyzed.sentences) == 1
            sentence_analyzed = paragraph_analyzed.sentences[0]
            assert len(sentence_analyzed.tokens) == len(sentence_gold.tokens)
            for token_gold, token_analyzed in zip(sentence_gold,
                                                  sentence_analyzed):