def main(docids, directory):
    """Print the ids of documents whose title is NOT mono-case.

    Parameters
    ----------
    docids: iterable of str
        document ids; ``<id>.auxil`` and ``<id>.paf`` files are expected
        under ``directory``
    directory: str
        directory containing the preprocessed document files
    """
    good_cnt = 0
    for i, id_ in enumerate(docids):
        if i % 1000 == 0:
            # progress log: matched / processed / total
            logger.info('{}/{}/{}'.format(good_cnt, i, len(docids)))
        path = os.path.join(directory, id_)
        titles, _ = separate_title_from_body(path + '.auxil', path + '.paf')
        # tokens of the first title sentence
        tokens = [t['token'] for t in titles[0]['features']]
        if not is_monocase(tokens):
            # NOTE(review): source formatting was collapsed; the nesting of
            # `good_cnt += 1` (inside vs. after the `if`) is assumed to be
            # inside, matching the "matched count" in the progress log —
            # confirm against version control
            print(id_)
            good_cnt += 1
def printable_train_data(malform_data_dir,
                         okform_data_dir,
                         ids,
                         extractor,
                         feature_names,
                         start,
                         end=None,
                         title_transform_func=make_capitalized_title,
                         exclude_labels=None,
                         exclude_word_positions=set([0])):
    """
    Adapted to PULS requirement:

    - auxil file is read to get the additional preprocessed features

    Parameters
    ------------
    malform_data_dir: string
        the directory where the malformed data reside
    okform_data_dir: string
        the directory where the correctly formed data reside
    ids: list of string
        document ids
    extractor: FeatureExtractor
        the feature extractor
    feature_names: list of string
        the feature names
    start, end: int
        how many titles to extract
    title_transform_func: function
        function that accepts the title and transforms it into
        some badly capitalized version
    exclude_labels: iterable of str
        labels that we don't consider; None/empty means keep all labels
    exclude_word_positions: set of int
        word positions to skip (default: the sentence-initial word)

    Returns
    ------------
    Generator of (str, str): document id and one sentence worth of
    tab-separated token features, one token per line
    """
    # BUGFIX: copy instead of `+=` so the caller's list is not mutated
    # (the old in-place append added one extra 'y' per call)
    feature_names = feature_names + ['y']  # add the label feature name

    malform_data_dir = Path(malform_data_dir)

    # take care of this ["tickerSymbol",["NYSE","SKT"]]
    # /cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format-capitalized/3987E0BD03749C996A04B881079AD753.auxil
    clean_tag = (lambda t: t[0] if isinstance(t, list) else t)
    get_tokens = partial(map, partial(get_in, ['token']))
    get_tags = partial(map, compose(clean_tag, partial(get_in, ['pos'])))
    get_lemmas = partial(map, partial(get_in, ['lemma']))

    n_collected = 0

    for i, id_ in enumerate(ids):
        if i < start:
            continue
        if i % 1000 == 0:
            logger.info("Collected %d" % n_collected)
            logger.info("Finished %d" % i)
        if end is not None and i >= end:
            logger.info("Reached %d. Terminate." % end)
            break
        try:
            malform_auxil_path = (malform_data_dir /
                                  Path(id_)).with_suffix('.auxil')
            with malform_auxil_path.open(encoding='utf8') as f:
                logger.debug('processing: {}'.format(id_))
                # the last line of the auxil file holds the json payload
                lines = f.readlines()
                if len(lines) == 0:
                    raise EmptyFileError(
                        'auxil file empty: {}'.format(malform_auxil_path))

                data = json.loads(lines[-1].strip())

                okform_auxil_path = str(
                    (okform_data_dir / Path(id_)).with_suffix('.auxil'))
                okform_paf_path = str(
                    (okform_data_dir / Path(id_)).with_suffix('.paf'))

                good_title_sents, body_sents = separate_title_from_body(
                    okform_auxil_path, okform_paf_path)

                # extract the body tokens, sentence by sentence
                doc = [[t['token'] for t in sent['features']]
                       for sent in body_sents]

                good_title_sents = list(good_title_sents)

                bad_title_sents = data['sents']
                if not isinstance(bad_title_sents, list):
                    raise InvalidTitleError(
                        'bad_title_sents not a list: {}'.format(
                            bad_title_sents))

                # we only consider headline that contains only ONE sentence
                if (len(good_title_sents) == 1 and
                        len(bad_title_sents) == 1):
                    good_sent = good_title_sents[0]
                    bad_sent = bad_title_sents[0]
                    good_title_tokens = get_tokens(good_sent['features'])
                    bad_title_tokens = get_tokens(bad_sent['features'])

                    # some validity checking: the two titles must match
                    # token-by-token modulo capitalization
                    if len(good_title_tokens) != len(bad_title_tokens):
                        raise TitleInconsistencyError('{}\n{}'.format(
                            good_title_tokens, bad_title_tokens))

                    good_title_tokens_lower = map(lambda s: s.lower(),
                                                  good_title_tokens)
                    bad_title_tokens_lower = map(lambda s: s.lower(),
                                                 bad_title_tokens)
                    if (good_title_tokens_lower != bad_title_tokens_lower):
                        raise TitleInconsistencyError('{}\n{}'.format(
                            good_title_tokens_lower, bad_title_tokens_lower))

                    tags = get_tags(bad_sent['features'])
                    lemmas = get_lemmas(bad_sent['features'])

                    # tag validity checking
                    for tag in tags:
                        if not (tag is None or isinstance(tag, basestring)):
                            raise InvalidTitleError(
                                '{}: tag {} not string'.format(id_, tag))

                    # get malformed title tokens
                    words = convert_to_trainable_format(
                        good_title_tokens,
                        title_transform_func,
                        extractor,
                        doc=doc,
                        pos=tags,
                        lemma=lemmas)

                    # format the features in the required form.
                    # BUGFIX: the old condition required `exclude_labels`
                    # to be truthy for ANY word to be emitted, so the
                    # default (None) produced empty output; also use a
                    # word index distinct from the outer doc index `i`
                    res = unicode()
                    for word_idx, word in enumerate(words):
                        if (word_idx not in exclude_word_positions and
                                (not exclude_labels or
                                 word['y'] not in exclude_labels)):
                            word_feature_str = u'\t'.join([
                                unicode(word[feature_name])
                                for feature_name in feature_names
                            ])
                            res += word_feature_str + '\n'
                    n_collected += 1
                    yield id_, res
                else:
                    raise TitleInconsistencyError(
                        '# of title sentences more than 1: {}'.format(id_))
        except (IOError, TitleInconsistencyError, InvalidTitleError,
                EmptyFileError):
            # expected per-document problems: record at debug and move on
            logger.debug(traceback.format_exc())
            continue
        except Exception:
            # unexpected failure: log it but keep the generator alive
            # (narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate)
            logger.error(traceback.format_exc())
            continue
def printable_train_data(malform_data_dir,
                         okform_data_dir,
                         ids,
                         extractor,
                         feature_names,
                         start,
                         end=None,
                         title_transform_func=make_capitalized_title,
                         exclude_labels=None,
                         exclude_word_positions=set([0])):
    """
    Adapted to PULS requirement:

    - auxil file is read to get the additional preprocessed features

    Parameters
    ------------
    malform_data_dir: string
        the directory where the malformed data reside
    okform_data_dir: string
        the directory where the correctly formed data reside
    ids: list of string
        document ids
    extractor: FeatureExtractor
        the feature extractor
    feature_names: list of string
        the feature names
    start, end: int
        how many titles to extract
    title_transform_func: function
        function that accepts the title and transforms it into
        some badly capitalized version
    exclude_labels: iterable of str
        labels that we don't consider; None/empty means keep all labels
    exclude_word_positions: set of int
        word positions to skip (default: the sentence-initial word)

    Returns
    ------------
    Generator of (str, str): document id and one sentence worth of
    tab-separated token features, one token per line
    """
    # BUGFIX: copy instead of `+=` so the caller's list is not mutated
    # (the old in-place append added one extra 'y' per call)
    feature_names = feature_names + ['y']  # add the label feature name

    malform_data_dir = Path(malform_data_dir)

    # take care of this ["tickerSymbol",["NYSE","SKT"]]
    # /cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format-capitalized/3987E0BD03749C996A04B881079AD753.auxil
    clean_tag = (lambda t: t[0] if isinstance(t, list) else t)
    get_tokens = partial(map, partial(get_in, ['token']))
    get_tags = partial(map, compose(clean_tag, partial(get_in, ['pos'])))
    get_lemmas = partial(map, partial(get_in, ['lemma']))

    n_collected = 0

    for i, id_ in enumerate(ids):
        if i < start:
            continue
        if i % 1000 == 0:
            logger.info("Collected %d" % n_collected)
            logger.info("Finished %d" % i)
        if end is not None and i >= end:
            logger.info("Reached %d. Terminate." % end)
            break
        try:
            malform_auxil_path = (malform_data_dir /
                                  Path(id_)).with_suffix('.auxil')
            with malform_auxil_path.open(encoding='utf8') as f:
                logger.debug('processing: {}'.format(id_))
                # the last line of the auxil file holds the json payload
                lines = f.readlines()
                if len(lines) == 0:
                    raise EmptyFileError(
                        'auxil file empty: {}'.format(malform_auxil_path))

                data = json.loads(lines[-1].strip())

                okform_auxil_path = str(
                    (okform_data_dir / Path(id_)).with_suffix('.auxil'))
                okform_paf_path = str(
                    (okform_data_dir / Path(id_)).with_suffix('.paf'))

                good_title_sents, body_sents = separate_title_from_body(
                    okform_auxil_path, okform_paf_path)

                # extract the body tokens, sentence by sentence
                doc = [[t['token'] for t in sent['features']]
                       for sent in body_sents]

                good_title_sents = list(good_title_sents)

                bad_title_sents = data['sents']
                if not isinstance(bad_title_sents, list):
                    raise InvalidTitleError(
                        'bad_title_sents not a list: {}'.format(
                            bad_title_sents))

                # we only consider headline that contains only ONE sentence
                if (len(good_title_sents) == 1 and
                        len(bad_title_sents) == 1):
                    good_sent = good_title_sents[0]
                    bad_sent = bad_title_sents[0]
                    good_title_tokens = get_tokens(good_sent['features'])
                    bad_title_tokens = get_tokens(bad_sent['features'])

                    # some validity checking: the two titles must match
                    # token-by-token modulo capitalization
                    if len(good_title_tokens) != len(bad_title_tokens):
                        raise TitleInconsistencyError('{}\n{}'.format(
                            good_title_tokens, bad_title_tokens))

                    good_title_tokens_lower = map(lambda s: s.lower(),
                                                  good_title_tokens)
                    bad_title_tokens_lower = map(lambda s: s.lower(),
                                                 bad_title_tokens)
                    if (good_title_tokens_lower != bad_title_tokens_lower):
                        raise TitleInconsistencyError('{}\n{}'.format(
                            good_title_tokens_lower, bad_title_tokens_lower))

                    tags = get_tags(bad_sent['features'])
                    lemmas = get_lemmas(bad_sent['features'])

                    # tag validity checking
                    for tag in tags:
                        if not (tag is None or isinstance(tag, basestring)):
                            raise InvalidTitleError(
                                '{}: tag {} not string'.format(id_, tag))

                    # get malformed title tokens
                    words = convert_to_trainable_format(
                        good_title_tokens,
                        title_transform_func,
                        extractor,
                        doc=doc,
                        pos=tags,
                        lemma=lemmas)

                    # format the features in the required form.
                    # BUGFIX: the old condition required `exclude_labels`
                    # to be truthy for ANY word to be emitted, so the
                    # default (None) produced empty output; also use a
                    # word index distinct from the outer doc index `i`
                    res = unicode()
                    for word_idx, word in enumerate(words):
                        if (word_idx not in exclude_word_positions and
                                (not exclude_labels or
                                 word['y'] not in exclude_labels)):
                            word_feature_str = u'\t'.join([
                                unicode(word[feature_name])
                                for feature_name in feature_names
                            ])
                            res += word_feature_str + '\n'
                    n_collected += 1
                    yield id_, res
                else:
                    raise TitleInconsistencyError(
                        '# of title sentences more than 1: {}'.format(id_))
        except (IOError, TitleInconsistencyError, InvalidTitleError,
                EmptyFileError):
            # expected per-document problems: record at debug and move on
            logger.debug(traceback.format_exc())
            continue
        except Exception:
            # unexpected failure: log it but keep the generator alive
            # (narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate)
            logger.error(traceback.format_exc())
            continue
def eval_rule_based(output_path,
                    okform_dir,
                    accepted_labels=set(['AL', 'IC']),
                    print_errors=False):
    """
    Evaluate rule-based capitalization predictions against gold titles.

    The prediction file is read two lines at a time: a document id, then
    a json object whose 'resultingHeadline' is the predicted token list.

    Parameters
    ------------
    output_path: str
        path to the prediction file
    okform_dir: str
        directory with the correctly-formed .auxil/.paf files
    accepted_labels: set of str
        labels taken into account in the statistics
    print_errors: bool
        if True, print IC->AL labeling errors per document

    Return:
    numpy.ndarray: (#label, 3)
        count of #match, #model, #ref for each label

    First word of sentence is ignored
    """
    ret_stat = np.zeros((len(accepted_labels), 3), dtype=np.float64)
    n_finished = 0
    n_errorless = 0
    with Path(output_path).open('r', encoding='utf8') as prediction_file:
        while True:
            if n_finished % 1000 == 0:
                logger.info('Finished {}/{}'.format(n_errorless, n_finished))
            line1 = prediction_file.readline()
            line2 = prediction_file.readline()
            if not line2:
                # EOF (a trailing unpaired id line is silently dropped)
                break
            try:
                id_ = line1.strip()
                pred_json = json.loads(line2.strip())
                if pred_json['resultingHeadline'] is None:
                    # no prediction made for this document
                    continue
                pred_tokens = pred_json['resultingHeadline']
                auxil_path = str(
                    Path(okform_dir) / Path(id_).with_suffix('.auxil'))
                paf_path = str(
                    Path(okform_dir) / Path(id_).with_suffix('.paf'))
                title_sents, _ = separate_title_from_body(auxil_path,
                                                          paf_path)
                # gold tokens of the first (and only considered) sentence
                true_tokens = [
                    item['token'] for item in title_sents[0]['features']
                ]
                if is_consistent_prediction(pred_tokens, true_tokens):
                    stat = eval_stat(pred_tokens, true_tokens,
                                     accepted_labels)
                    if print_errors:
                        print_label_error(
                            true_tokens,
                            # we don't have features here
                            features=None,
                            instance_id=id_,
                            excluded_indices=set([0]),
                            correct_labels=map(get_label, true_tokens),
                            predicted_labels=map(get_label, pred_tokens),
                            target_true_label='IC',
                            target_pred_label='AL',
                            print_features=False)
                    ret_stat += stat
                    n_errorless += 1
                else:
                    logger.debug(
                        'Predicted and true tokens inconsisent:\n{}\n{}\n'.
                        format(pred_tokens, true_tokens))
            except Exception:
                # narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit still propagate; log and keep evaluating
                logger.error(traceback.format_exc())
                continue
            finally:
                n_finished += 1
    return ret_stat
def eval_rule_based(output_path,
                    okform_dir,
                    accepted_labels=set(['AL', 'IC']),
                    print_errors=False):
    """
    Evaluate rule-based capitalization predictions against gold titles.

    The prediction file is read two lines at a time: a document id, then
    a json object whose 'resultingHeadline' is the predicted token list.

    Parameters
    ------------
    output_path: str
        path to the prediction file
    okform_dir: str
        directory with the correctly-formed .auxil/.paf files
    accepted_labels: set of str
        labels taken into account in the statistics
    print_errors: bool
        if True, print IC->AL labeling errors per document

    Return:
    numpy.ndarray: (#label, 3)
        count of #match, #model, #ref for each label

    First word of sentence is ignored
    """
    ret_stat = np.zeros((len(accepted_labels), 3), dtype=np.float64)
    n_finished = 0
    n_errorless = 0
    with Path(output_path).open('r', encoding='utf8') as prediction_file:
        while True:
            if n_finished % 1000 == 0:
                logger.info('Finished {}/{}'.format(n_errorless, n_finished))
            line1 = prediction_file.readline()
            line2 = prediction_file.readline()
            if not line2:
                # EOF (a trailing unpaired id line is silently dropped)
                break
            try:
                id_ = line1.strip()
                pred_json = json.loads(line2.strip())
                if pred_json['resultingHeadline'] is None:
                    # no prediction made for this document
                    continue
                pred_tokens = pred_json['resultingHeadline']
                auxil_path = str(
                    Path(okform_dir) / Path(id_).with_suffix('.auxil'))
                paf_path = str(
                    Path(okform_dir) / Path(id_).with_suffix('.paf'))
                title_sents, _ = separate_title_from_body(auxil_path,
                                                          paf_path)
                # gold tokens of the first (and only considered) sentence
                true_tokens = [
                    item['token'] for item in title_sents[0]['features']
                ]
                if is_consistent_prediction(pred_tokens, true_tokens):
                    stat = eval_stat(pred_tokens, true_tokens,
                                     accepted_labels)
                    if print_errors:
                        print_label_error(
                            true_tokens,
                            # we don't have features here
                            features=None,
                            instance_id=id_,
                            excluded_indices=set([0]),
                            correct_labels=map(get_label, true_tokens),
                            predicted_labels=map(get_label, pred_tokens),
                            target_true_label='IC',
                            target_pred_label='AL',
                            print_features=False)
                    ret_stat += stat
                    n_errorless += 1
                else:
                    logger.debug(
                        'Predicted and true tokens inconsisent:\n{}\n{}\n'.
                        format(pred_tokens, true_tokens))
            except Exception:
                # narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit still propagate; log and keep evaluating
                logger.error(traceback.format_exc())
                continue
            finally:
                n_finished += 1
    return ret_stat