コード例 #1
0
                    raise TitleInconsistencyError(
                        '# of title sentences more than 1: {}'.format(id_))
        except (IOError, TitleInconsistencyError, InvalidTitleError,
                EmptyFileError):
            logger.debug(traceback.format_exc())
            continue
        except:
            logger.error(traceback.format_exc())
            continue


if __name__ == '__main__':
    import sys
    from puls_util import get_doc_ids_from_file
    exclude_labels = set(['MX', 'AU', 'AN'])
    ids = get_doc_ids_from_file(sys.argv[1])

    malform_data_dir = '/cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format-capitalized/'
    okform_data_dir = '/cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format/'
    extractor = FeatureExtractor()
    start = int(sys.argv[2])

    try:
        end = int(sys.argv[3])
    except IndexError:
        end = None

    successful_ids = []
    for id_, l in printable_train_data(
            malform_data_dir,
            okform_data_dir,
コード例 #2
0
import os
from util import is_monocase
from puls_util import (get_doc_ids_from_file, separate_title_from_body)

import logging

# Module-level logger named after this module. Its threshold is set to INFO so
# that the progress records emitted by main() are not filtered at this logger.
# NOTE(review): no handler is attached in the visible code -- confirm that one
# is configured elsewhere (e.g. logging.basicConfig), otherwise the INFO
# progress records may never be displayed.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def main(docids, directory):
    """Print the id of every document whose title is not monocase.

    For each id in ``docids`` the files ``<directory>/<id>.auxil`` and
    ``<directory>/<id>.paf`` are passed to ``separate_title_from_body``.  The
    ``'token'`` values of the first returned title's ``'features'`` are then
    checked with ``is_monocase``; ids whose tokens are *not* monocase are
    written to stdout, one per line.

    A progress record ``<printed>/<index>/<total>`` is logged at INFO level
    every 1000 documents.

    Documents for which no title is returned are skipped with a warning
    instead of aborting the whole scan with an ``IndexError``.

    :param docids: sized sequence of document ids (``len()`` is taken once)
    :param directory: directory holding the ``.auxil`` / ``.paf`` files
    :return: the number of ids that were printed
    """
    good_cnt = 0
    total = len(docids)  # loop-invariant, only needed for progress output
    for i, id_ in enumerate(docids):
        if i % 1000 == 0:
            # Lazy %-style arguments: the message is only built when the
            # record is actually emitted.
            logger.info('%s/%s/%s', good_cnt, i, total)

        path = os.path.join(directory, id_)
        titles, _ = separate_title_from_body(path + '.auxil', path + '.paf')
        if not titles:
            # Without this guard titles[0] would raise IndexError and stop
            # the scan at the first title-less document.
            logger.warning('no title found, skipping: %s', id_)
            continue

        tokens = [t['token'] for t in titles[0]['features']]
        if not is_monocase(tokens):
            print(id_)
            good_cnt += 1

    return good_cnt


if __name__ == '__main__':
    # Script entry point: both inputs are hard-coded paths.
    doc_id_file = 'data/tmp/2015-08-18/trainable_doc_ids.txt'
    corpus_dir = '/cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format'

    # Load the document ids, then scan their titles in the corpus directory.
    main(get_doc_ids_from_file(doc_id_file), corpus_dir)
コード例 #3
0
                    raise TitleInconsistencyError(
                        '# of title sentences more than 1: {}'.format(id_)
                    )
        except (IOError, TitleInconsistencyError,
                InvalidTitleError, EmptyFileError):
            logger.debug(traceback.format_exc())
            continue
        except:
            logger.error(traceback.format_exc())
            continue

if __name__ == '__main__':
    import sys
    from puls_util import get_doc_ids_from_file
    exclude_labels = set(['MX', 'AU', 'AN'])
    ids = get_doc_ids_from_file(sys.argv[1])

    malform_data_dir = '/cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format-capitalized/'
    okform_data_dir = '/cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format/'
    extractor = FeatureExtractor()
    start = int(sys.argv[2])
    
    try:
        end = int(sys.argv[3])
    except IndexError:
        end = None

    successful_ids = []
    for id_, l in printable_train_data(malform_data_dir,
                                       okform_data_dir,
                                       ids,
コード例 #4
0
import os
from util import is_monocase
from puls_util import (get_doc_ids_from_file,
                       separate_title_from_body)


import logging

# Module-level logger named after this module. Its threshold is set to INFO so
# that the progress records emitted by main() are not filtered at this logger.
# NOTE(review): no handler is attached in the visible code -- confirm that one
# is configured elsewhere (e.g. logging.basicConfig), otherwise the INFO
# progress records may never be displayed.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def main(docids, directory):
    """Report documents whose title mixes upper and lower case.

    Every id in ``docids`` is resolved to ``<directory>/<id>.auxil`` and
    ``<directory>/<id>.paf``, which are given to ``separate_title_from_body``.
    The ``'token'`` entries of the first title's ``'features'`` are tested
    with ``is_monocase``; when the test fails the id is printed to stdout.

    Progress is logged at INFO level as ``<printed>/<index>/<total>`` once
    per 1000 documents.

    A document that yields no title is skipped (with a warning) rather than
    crashing the run with an ``IndexError`` on ``titles[0]``.

    :param docids: sized sequence of document ids
    :param directory: directory containing the ``.auxil`` / ``.paf`` files
    :return: how many ids were printed
    """
    good_cnt = 0
    total = len(docids)  # computed once instead of on every progress record
    for i, id_ in enumerate(docids):
        if i % 1000 == 0:
            # Arguments are passed separately so formatting is deferred to
            # the logging machinery.
            logger.info('%s/%s/%s', good_cnt, i, total)

        path = os.path.join(directory, id_)
        titles, _ = separate_title_from_body(path + '.auxil', path + '.paf')
        if not titles:
            # Guard against an empty title list; indexing it would raise.
            logger.warning('no title found, skipping: %s', id_)
            continue

        tokens = [t['token']
                  for t in titles[0]['features']]
        if not is_monocase(tokens):
            print(id_)
            good_cnt += 1

    return good_cnt
        
if __name__ == '__main__':
    # Run as a script: read the id list from a fixed file and scan the
    # fixed corpus directory.
    trainable_ids = get_doc_ids_from_file(
        'data/tmp/2015-08-18/trainable_doc_ids.txt')
    main(trainable_ids,
         '/cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format')