Example 1
def separate_tiers(args):
    tiers = set(args.tiers)
    # assuming XML for now
    with open(args.infile, 'r') as instream:
        src_xc = xigtxml.load(instream)
        sep_xc = XigtCorpus(attributes=src_xc.attributes,
                            metadata=src_xc.metadata)
        for igt in src_xc.igts:
            sep_xc.add(
                Igt(id=igt.id,
                    type=igt.type,
                    attributes=igt.attributes,
                    metadata=igt.metadata,
                    tiers=[t for t in igt.tiers if t.type in tiers]))
        with open(args.outfile, 'w') as outstream:
            xigtxml.dump(outstream, sep_xc)

    if not args.remainder: return
    with open(args.infile, 'r') as instream:
        src_xc = xigtxml.load(instream)
        rem_xc = XigtCorpus(attributes=src_xc.attributes,
                            metadata=src_xc.metadata)
        for igt in src_xc.igts:
            rem_xc.add(
                Igt(id=igt.id,
                    type=igt.type,
                    attributes=igt.attributes,
                    metadata=igt.metadata,
                    tiers=[t for t in igt.tiers if t.type not in tiers]))
        with open(args.remainder, 'w') as outstream:
            xigtxml.dump(outstream, rem_xc)
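
A minimal driver for this function, assuming the argparse attributes used above (tiers, infile, outfile, remainder); the tier type names and file paths below are placeholders:

from argparse import Namespace

args = Namespace(tiers=['words', 'glosses'],   # hypothetical tier types
                 infile='corpus.xml',
                 outfile='selected.xml',
                 remainder='rest.xml')
separate_tiers(args)
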
Example 2
def run(args):
    if args.infiles:
        for fn in args.infiles:
            logging.info('Normalizing {}'.format(fn))
            xc = xigtxml.load(fn, mode='full')
            normalize_corpus(xc)
            xigtxml.dump(fn, xc)
    else:
        xc = xigtxml.load(sys.stdin, mode='full')
        normalize_corpus(xc)
        print(xigtxml.dumps(xc))
Example 3
def run(args):
    if args.infiles:
        for fn in args.infiles:
            logging.info('Cleaning {}'.format(fn))
            xc = xigtxml.load(fn, mode='full')
            clean_corpus(xc)
            xigtxml.dump(fn, xc)
    else:
        xc = xigtxml.load(sys.stdin, mode='full')
        clean_corpus(xc)
        print(xigtxml.dumps(xc))
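
Examples 2 and 3 share the same load/process/dump round trip; a generic sketch of that pattern, with process standing in for normalize_corpus or clean_corpus, might look like:

def process_files(infiles, process):
    # Load each corpus fully, mutate it in place, and overwrite the file.
    for fn in infiles:
        xc = xigtxml.load(fn, mode='full')
        process(xc)
        xigtxml.dump(fn, xc)
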
Example 4
def xc_load(path, mode=FULL, do_basic_processing=False):
    # The handle is deliberately left open: in non-FULL modes
    # (incremental/transient) the corpus is read lazily while iterating.
    f = open(path, 'r', encoding='utf-8')
    xc = xigtxml.load(f, mode=mode)
    if do_basic_processing:
        for inst in xc:
            basic_processing(inst)
    return xc
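
Assuming basic_processing mutates each instance in place, a call might look like this (the path is a placeholder):

xc = xc_load('corpus.xml', do_basic_processing=True)
print('{} IGTs loaded'.format(len(xc)))
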
Example 5
def run(args):
    from xml.etree import ElementTree as ET
    passed = []
    ids = Counter()
    for i, f in enumerate(args.files):
        with open(f, 'r') as fh:
            try:
                xc = xigtxml.load(fh, mode='transient')
            except ET.ParseError:
                print('Corpus {} ({}) failed to load. First verify '
                      'that the XML file is valid by doing a schema '
                      'validation.'
                      .format(i, f))
            else:
                context = make_context(
                    xc, i, '<xigt-corpus>', 'collection', ids=ids
                )
                report = validate_corpus(xc, context)
                report = filter_empty_reports(
                    report, minlevel=logging.getLogger().getEffectiveLevel()
                )
                if report_is_empty(report):
                    passed.append(True)
                else:
                    passed.append(False)
                    print_report(report, args)
                add_id(ids, xc)
    return all(passed)
Example 6
def print_stats(args):
    def new_stats():
        return {
            'languages': set(),
            'iso-639-3': defaultdict(lambda: defaultdict(int)),
            'instances': 0,
            'igts': defaultdict(int),
            'tiers': defaultdict(int),
            'items': defaultdict(int),
        }

    stats = new_stats()
    lg_condition = lambda m: 'phrases' in m.attributes.get('tiers', '')
    num_files = 0
    for f in args.files:
        with open(f, 'r') as fh:
            num_files += 1
            cur_stats = new_stats()
            xc = xigtxml.load(fh, mode='transient')
            for igt in xc:
                stats['instances'] += 1
                cur_stats['instances'] += 1
                # language is in a meta element
                lgs = igt.get_meta('language', conditions=[lg_condition])
                if lgs:
                    lg_name = lgs[0].attributes.get('name', '???').strip()
                    lg_iso = lgs[0].attributes.get('iso-639-3', '???').strip()
                else:
                    lg_name = ''
                    lg_iso = ''
                stats['languages'].add(lg_name.lower())
                cur_stats['languages'].add(lg_name.lower())
                stats['iso-639-3'][lg_iso][lg_name] += 1
                cur_stats['iso-639-3'][lg_iso][lg_name] += 1
                # count tiers and items by types, IGTs by tier types
                all_tier_types = set()
                for tier in igt:
                    stats['tiers'][tier.type] += 1
                    cur_stats['tiers'][tier.type] += 1
                    all_tier_types.add(tier.type)
                    for item in tier:
                        stats['items'][item.type] += 1
                        cur_stats['items'][item.type] += 1
                stats['igts'][tuple(sorted(all_tier_types))] += 1
                cur_stats['igts'][tuple(sorted(all_tier_types))] += 1

        if args.summarize_each:
            print_summary('{} summary:'.format(f), cur_stats)
        if args.languages_each:
            print_languages('Languages used in {}:'.format(f),
                            cur_stats['iso-639-3'])
    if args.summarize:
        print_summary(
            'Overall summary ({} file{}):'.format(
                num_files, 's' if num_files != 1 else ''), stats)
    if args.languages:
        print_languages(
            'Languages used overall ({} file{}):'.format(
                num_files, 's' if num_files != 1 else ''), stats['iso-639-3'])
Example 7
def divide_corpus(args):
    infile = args.infile
    outdir = args.outdir
    igt_index = [0]  # just a list so I don't have to nonlocal it later
    indices = set()

    def make_filename(fn):
        return os.path.join(outdir, fn + '.xml')

    # this should make reading the corpus faster
    def selective_decode_igt(elem):
        idx = igt_index.pop()
        if idx not in indices:
            igt = None
        else:
            igt = xigtxml.default_decode_igt(elem)
            indices.remove(idx)
        igt_index.append(idx + 1)
        return igt

    if args.meta is not None:
        metatype, func = args.meta
        # note: eval() runs a user-supplied expression; only pass trusted input
        func = eval('lambda m:{}'.format(func))
        get_key = lambda igt: next(
            (func(m) for m in igt.get_meta(metatype, default=[]) if m is not None),
            None
        )
    else:
        # assumption: without --meta, group all IGTs under one key so the
        # later get_key() calls don't raise a NameError
        get_key = lambda igt: None

    # get a mapping of code to the indexed position of each IGT
    keymap = defaultdict(set)
    with open(infile, 'r') as instream:
        # transient mode reads lazily, so iterate while the handle is open
        xc = xigtxml.load(instream, mode='transient')
        for i, igt in enumerate(xc):
            key = get_key(igt)
            keymap[key].add(i)

    xigtxml.decode_igt = selective_decode_igt

    # now group IGTs with similar languages into a file
    for key, indices in keymap.items():
        if key is None:
            key = '-others-'  # FIXME not guaranteed to be unique
        igt_index = [0]
        with open(infile, 'r') as instream, \
                open(make_filename(key), 'w') as outstream:
            xc = xigtxml.load(instream, mode='transient')
            xigtxml.dump(outstream, xc)
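
Reassigning xigtxml.decode_igt above is a module-level monkey-patch that affects every later load; a more defensive sketch (assuming single-threaded use) restores the default when done:

original_decode_igt = xigtxml.decode_igt
xigtxml.decode_igt = selective_decode_igt
try:
    pass  # ... the per-key load/dump loop from above goes here ...
finally:
    xigtxml.decode_igt = original_decode_igt
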
Example 8
def write(out_fn, fn_idx):
    xc = XigtCorpus()
    for fn, igt_indices in fn_idx.items():
        # if possible, try to decode needed igts only and skip the rest
        in_xc = xigtxml.load(fn, mode='transient')
        # ignoring corpus-level metadata
        xc.extend(igt for i, igt in enumerate(in_xc) if i in igt_indices)
    # assume the nsmap of the first igt is the same for all
    if xc.igts: xc.nsmap = xc[0].nsmap
    xigtxml.dump(out_fn, xc)
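
Here fn_idx maps each input filename to the set of zero-based IGT positions to copy from it; for example (hypothetical filenames):

fn_idx = {
    'corpus-a.xml': {0, 2, 5},  # 1st, 3rd, and 6th IGTs
    'corpus-b.xml': {1},        # 2nd IGT only
}
write('merged.xml', fn_idx)
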
Example 9
def run(args):
    job = make_job(args)
    agenda = job['agenda']
    global_c = defaultdict(int)  # assumption: a counter; the original omitted the factory call
    for infile in args.infiles:
        filename = basename(infile) if args.basename else infile
        print(job['file_description'].format(filename=filename))
        xc = xigtxml.load(infile)
        results = process_agenda(xc, agenda)
        print_results(results)
        print()
Example 10
def run(infile, outpath, out_format, config=None):
    cfg = None
    if config:
        import json
        with open(config, 'r') as cfg_f:
            cfg = json.load(cfg_f)
    if out_format == 'latex':
        import xigt.exporters.latex as exporter
    elif out_format == 'itsdb':
        import xigt.exporters.itsdb as exporter
    # elif ...
    with open(infile, 'r') as in_fh:
        xc = xigtxml.load(in_fh, mode='transient')
        exporter.xigt_export(xc, outpath, config=cfg)
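
Each exporter module is expected to expose an xigt_export(xc, outpath, config=None) entry point (inferred from the call above); a skeleton for a hypothetical new exporter, under that assumption:

# my_exporter.py -- a hypothetical exporter module
def xigt_export(xc, outpath, config=None):
    # A real exporter would emit its target format; this just lists IGT ids.
    with open(outpath, 'w', encoding='utf-8') as out:
        for igt in xc:
            out.write('{}\n'.format(igt.id))
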
Example 11
def set_vectors(datasets):
    """
    Take loaded datasets pointing to XIGT files, load the IGT from the
    files and then send the glosses to vectors.

    Args:
        datasets: loaded dataset objects

    Returns: None

    """
    # Process and convert data
    for dataset in datasets:
        for iso in datasets[dataset]["iso_list"]:
            # Open the current xigt file
            xc = xigtxml.load(open(datasets[dataset]["iso_list"][iso]["xigt"]))
            for igt in xc:
                # Ignore lines without glosses
                if not igt.get('g'):
                    continue

                # Capture the translated words if a translation line exists
                try:
                    words = dict((w, True) for w in ' '.join(
                        [str(line.value()).lower()
                         for line in igt.get('t')]).split())
                except TypeError:  # igt.get('t') returned None
                    words = {}

                # Determine which glosses share a morpheme
                morphemes = {}
                for gloss in igt.get('g'):
                    # setdefault replaces the original's bogus .get(..., 0)
                    # default, which could never be added to a list
                    morphemes.setdefault(gloss.alignment, []).append(
                        re.sub(' ', '', str(gloss.value()).lower()))

                # Create a vector for each gloss instance
                for gloss in igt.get('g'):
                    stripped = re.sub(PUNCTEX, '', gloss.value())
                    if stripped:
                        word_match = stripped.lower() in words
                        shared = (morphemes[gloss.alignment]
                                  if gloss.alignment else '')
                        set_vector(dataset, iso, stripped, shared, word_match)
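
The nested shape set_vectors expects of datasets can be read off the subscripts above; a minimal hypothetical instance:

datasets = {
    'train': {                           # dataset name
        'iso_list': {
            'deu': {'xigt': 'deu.xml'},  # ISO code -> Xigt-XML path (placeholder)
        },
    },
}
set_vectors(datasets)
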
Example 12
def split_corpus(filelist, train=0, dev=0, test=0, prefix='', seed=None, overwrite=False, nfold=1):

    # TODO: Make it so we automatically get to one

    # -------------------------------------------
    # Check the arguments
    # -------------------------------------------
    split_sum = train + dev + test
    if abs(split_sum - 1.0) > 1e-6:  # tolerate floating-point rounding
        SPLIT_LOG.critical('Sum of train({}) + dev({}) + test({}) should = 1, not {}'.format(train, dev, test, split_sum))
        raise CorpusSplitException()

    instances = []

    # -- 1) Load all the files
    for f in filelist:
        SPLIT_LOG.info("Loading file {}".format(f))
        with open(f, 'r', encoding='utf-8') as fh:
            xc = xigtxml.load(fh)  # default mode is 'full', so closing is safe
        instances.extend(xc)

    # -------------------------------------------
    # Run the requested number of folds
    # -------------------------------------------
    offset = 0
    for fold in range(0, nfold):

        # -- 2) Shuffle with the specified seed if requested
        if seed is not None:
            # r.seed(seed) returns None, so the original call shuffled unseeded
            r = random.Random(seed)
            r.shuffle(instances)

        # -- 3) Move the files by the sliding offset if specified...
        offset_start = int(len(instances) * offset)


        instances = instances[offset_start:] + instances[:offset_start]

        # Actually split the instances
        train_instances, dev_instances, test_instances = split_instances(instances, train, dev, test)

        train_path = outpath_name(prefix, 'train', nfold, fold)
        dev_path   = outpath_name(prefix, 'dev', nfold, fold)
        test_path  = outpath_name(prefix, 'test', nfold, fold)

        # -- 6) Write out the output files.
        write_instances(train_instances, train_path, 'train', overwrite)
        write_instances(dev_instances, dev_path, 'dev', overwrite)
        write_instances(test_instances, test_path, 'test', overwrite)

        offset += (1 / nfold)
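
split_instances is not shown here; assuming it simply slices the (already shuffled) list by the given proportions, it might be sketched as:

def split_instances(instances, train, dev, test):
    # Hedged sketch: proportional slicing, with any rounding remainder
    # falling into the test split.
    n = len(instances)
    n_train = int(n * train)
    n_dev = int(n * dev)
    return (instances[:n_train],
            instances[n_train:n_train + n_dev],
            instances[n_train + n_dev:])
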
Example 13
def run(args):
    xc = xigtxml.load(args.infile)
    if args.igt_key:
        logging.info('Sorting %s IGTs' % args.infile)
        xc.sort(key=make_sortkey(args.igt_key))
    if args.tier_key:
        logging.info('Sorting %s tiers by key' % args.infile)
        for igt in xc:
            igt.sort(key=make_sortkey(args.tier_key))
    elif args.tier_deps:
        logging.info('Sorting %s tiers by ref-dependencies' % args.infile)
        refattrs = [ra.strip() for ra in args.tier_deps.split(',')]
        for igt in xc:
            igt.sort_tiers(refattrs=refattrs)
    if args.item_key:
        logging.info('Sorting %s items by key' % args.infile)
        for igt in xc:
            for tier in igt:
                tier.sort(key=make_sortkey(args.item_key))
    if args.in_place:
        xigtxml.dump(args.infile, xc)
    else:
        print(xigtxml.dumps(xc))
Example 14
def setUp(self):
    xc = xigtxml.load(ger_file)
    self.inst = xc[0]
Example 15
def index(fn, by, idx):
    xc = xigtxml.load(fn, mode='transient')
    for i, igt in enumerate(xc):
        idx_key = xp.find(igt, by)
        idx[idx_key][fn].add(i)
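
From idx[idx_key][fn].add(i) we can infer that idx must be a nested mapping of key to filename to a set of positions; a compatible container and a hypothetical call:

from collections import defaultdict

idx = defaultdict(lambda: defaultdict(set))
# 'by' takes a xigtpath expression; this one is only a placeholder.
index('corpus.xml', by='metadata//dc:subject', idx=idx)
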
Example 16
        affixes.append(split[1])
print(affixes)
feature_dictionary = {}
lang_count = 0

for i in range(len(odin_corpus)):
    filename = os.path.basename(odin_corpus[i])
    language_code = os.path.splitext(filename)[0]
    try:
        # this is just a check to see if we get an error here.
        # we're going to error out if we can't look up this language/feature in WALS.
        wals_code = wals_dictionary.iso_to_wals[language_code]
        wals_value = wals.feature_dictionary[wals_code]
    except KeyError:  # it wasn't in the dictionary of languages which have reported stats for feature in WALS
        continue
    xc = xigtxml.load(odin_corpus[i], mode='transient')
    prefix_count = 0
    suffix_count = 0
    affix_count = 0
    no_affix_count = 0
    sentence_count = 0
    hasmarker = False
    igt_list = []
    for igt in xc:
        try:
            gloss = igt["g"]
            alignments = igt["a"]
            #glosspos = igt["gw-pos"]
        except KeyError:  # instance lacks a gloss or alignment tier
            continue
        sentence_count += 1
Example 17
def do_projection(**kwargs):
    """
    (Re)project the POS and syntactic (PT/DS) tiers using the chosen alignment.

    :param aln_method: The alignment method
    """
    kwargs = ArgPasser(kwargs)
    aln_method = ALN_ARG_MAP[kwargs.get('aln_method', ARG_ALN_ANY)]

    successes = 0
    failures  = 0

    in_path = kwargs.get(ARG_INFILE)
    with open(in_path, 'r', encoding='utf-8') as f:
        PROJ_LOG.log(1000, 'Loading file "{}"...'.format(os.path.basename(in_path)))
        xc = xigtxml.load(f, mode=INCREMENTAL)
        for inst in xc:
            success_fail_string = 'Instance {:20s} {{:10s}}{{}}'.format('"'+inst.id+'"...')

            def fail(reason):
                nonlocal failures, success_fail_string
                success_fail_string = success_fail_string.format('FAIL', reason)
                failures += 1
            def success():
                nonlocal successes, success_fail_string
                success_fail_string = success_fail_string.format('SUCCESS', '')
                successes += 1

            # Query whether we want to require to use only trees
            # where the alignment is 100%.
            completeness_requirement = kwargs.get('completeness', default=0, t=float)

            # TODO: Find better way to do this?
            try:
                if kwargs.get('pos', True):
                    project_trans_pos_to_gloss(inst, aln_method=aln_method, completeness_requirement=completeness_requirement)
                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                if kwargs.get('ds', True):
                    project_pt_tier(inst, proj_aln_method=aln_method)
                    project_ds_tier(inst, proj_aln_method=aln_method, completeness_requirement=completeness_requirement)
            except NoNormLineException:
                fail("Bad Lines")
            except (NoAlignmentProvidedError, ProjectionException):
                fail("Alignment")
            except GlossLangAlignException:
                fail("Gloss-Lang")
            except ProjectionIncompleteAlignment:
                fail("Alignment Incomplete")
            except PhraseStructureProjectionException:
                fail("Projection Failed")
            else:
                success()
            finally:
                PROJ_LOG.info(success_fail_string)
                inst.sort_tiers()

        out_path = kwargs.get(ARG_OUTFILE)
        # Try to make the folder if it doesn't already exist.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)

        PROJ_LOG.log(1000, 'Writing new file "{}"...'.format(os.path.basename(out_path)))
        with open(out_path, 'w', encoding='utf-8') as out_f:
            xigtxml.dump(out_f, xc)

    PROJ_LOG.log(1000, '{} instances processed, {} successful, {} failed.'.format(len(xc), successes, failures))
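
Note the pattern above: with mode=INCREMENTAL the corpus is parsed lazily, so both the instance loop and the final dump happen while the source file is still open. A condensed sketch of that pattern (placeholder filenames):

with open('input.xigt.xml', 'r', encoding='utf-8') as f:
    xc = xigtxml.load(f, mode=INCREMENTAL)
    for inst in xc:
        pass  # ... process each instance lazily ...
    with open('output.xigt.xml', 'w', encoding='utf-8') as out_f:
        xigtxml.dump(out_f, xc)
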
Example 18
def enrich(**kwargs):

    global classifier

    if ARG_OUTFILE not in kwargs:
        ENRICH_LOG.critical("No output file specified.")
        sys.exit()

    # =============================================================================
    # Set up the alternate classifier path...
    # =============================================================================

    class_path = kwargs.get('class_path')

    #===========================================================================
    # Set up the different arguments...
    #===========================================================================
    inpath = kwargs.get(ARG_INFILE)

    parse_args = kwargs.get(PARSE_VAR, [])
    pos_args = kwargs.get(POS_VAR, [])
    aln_args = kwargs.get(ALN_VAR, [])

    max_parse_length = kwargs.get('max_parse_length', 10)

    if not (parse_args or pos_args or aln_args):
        ENRICH_LOG.warning("No enrichment specified. Basic processing only will be performed.")

    #===========================================================================
    # Sanity check the arguments.
    #===========================================================================

    # Check that alignment is asked for if projection is asked for.
    if (ARG_POS_PROJ in pos_args or ARG_PARSE_PROJ in parse_args) and (not aln_args):
        ENRICH_LOG.warning("You have asked for projection methods but have not requested "
                           "alignments to be generated. Projection may fail if alignment is not already present in the file.")

    ENRICH_LOG.log(1000, 'Loading input file...')
    with open(inpath, 'r', encoding='utf-8') as in_f:
        corp = xigtxml.load(in_f, mode=INCREMENTAL)

        # -------------------------------------------
        # Initialize the English tagger if:
        #   A) "proj" option is selected for pos.
        #   B) "trans" option is given for pos.
        #   C) "heurpos" option is given for alignment.
        # -------------------------------------------
        s = None
        if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, 'Initializing tagger...')
            tagger = c.getpath('stanford_tagger_trans')

            try:
                s = StanfordPOSTagger(tagger)
            except TaggerError as te:
                ENRICH_LOG.critical(te)
                sys.exit(2)

        # -------------------------------------------
        # Initialize the parser if:
        #    A) "trans" option is given for parse
        #    B) "proj" option is given for parse.
        # -------------------------------------------
        if ARG_PARSE_TRANS in parse_args or ARG_PARSE_PROJ in parse_args:
            ENRICH_LOG.log(1000, "Intializing English parser...")
            sp = stanford_parser.StanfordParser()

        # -------------------------------------------
        # Initialize the classifier if:
        #    A) "class" option is given for pos
        #    B) "heurpos" option is given for alignment.
        # -------------------------------------------
        m = None
        if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
            ENRICH_LOG.log(1000, "Initializing gloss-line classifier...")
            p = load_posdict()
            m = mallet_maxent.MalletMaxent(classifier)


        # -- 1b) Giza Gloss to Translation alignment --------------------------------------
        if ARG_ALN_GIZA in aln_args or ARG_ALN_GIZAHEUR in aln_args:
            ENRICH_LOG.log(1000, 'Aligning gloss and translation lines using mgiza++...')

            try:
                if ARG_ALN_GIZAHEUR in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=True, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
                if ARG_ALN_GIZA in aln_args:
                    giza_align_t_g(corp, resume=True, use_heur=False, symmetric=kwargs.get(ALN_SYM_VAR, SYMMETRIC_INTERSECT))
            except GizaAlignmentException as gae:
                gl = logging.getLogger('giza')
                gl.critical(str(gae))
                raise gae

        # -------------------------------------------
        # Begin iterating through the corpus
        # -------------------------------------------

        for inst in corp:

            feedback_string = 'Instance {:15s}: {{:20s}}{{}}'.format(inst.id)

            reasons = []
            inst_status = None

            def fail(reason):
                nonlocal inst_status, reasons
                if reason not in reasons:
                    reasons.append(reason)
                inst_status = 'WARN'

            def success():
                nonlocal inst_status
                inst_status = 'OK'

            # -------------------------------------------
            # Define the reasons for failure
            # -------------------------------------------
            F_GLOSS_LINE = "NOGLOSS"
            F_LANG_LINE  = "NOLANG"
            F_TRANS_LINE = "NOTRANS"
            F_BAD_LINES  = "BADLINES"
            F_L_G_ALN    = "L_G_ALIGN"
            F_T_G_ALN    = "G_T_ALIGN"
            F_NO_TRANS_POS="NO_POS_TRANS"
            F_PROJECTION = "PROJECTION"
            F_UNKNOWN    = "UNKNOWN"
            F_PARSELEN   = "OVER_MAX_LENGTH"


            try:

                # -------------------------------------------
                # Get the different lines
                # -------------------------------------------
                def tryline(func):
                    nonlocal inst
                    try:
                        return func(inst)
                    except NoNormLineException as nnle:
                        return None

                gl = tryline(gloss_line)
                tls = tryline(trans_lines)
                lls  = tryline(lang_lines)

                has_gl = gl is not None
                has_tl = tls is not None
                has_ll = lls is not None

                has_all = lambda: (has_gl and has_tl and has_ll)


                # -------------------------------------------
                # Translation Line
                # -------------------------------------------
                if has_tl:

                    if ARG_POS_PROJ in pos_args or ARG_POS_TRANS in pos_args or ARG_ALN_HEURPOS in aln_args:

                        try:
                            tag_trans_pos(inst, s)
                        except CriticalTaggerError as cte:
                            ENRICH_LOG.critical(str(cte))
                            sys.exit(2)

                    if ARG_PARSE_PROJ in parse_args or ARG_PARSE_TRANS in parse_args:
                        if len(trans(inst)) <= max_parse_length:
                            parse_translation_line(inst, sp, pt=True, dt=True)
                        else:
                            fail(F_PARSELEN)

                # 4) POS tag the gloss line --------------------------------------------
                if has_gl:
                    if ARG_POS_CLASS in pos_args or ARG_ALN_HEURPOS in aln_args:
                        classify_gloss_pos(inst, m, posdict=p)

                # -------------------------------------------
                # Try getting alignments.
                # -------------------------------------------
                if has_gl and has_ll:
                    try:
                        add_gloss_lang_alignments(inst)
                    except GlossLangAlignException as glae:
                        fail(F_L_G_ALN)

                if has_gl and has_tl:
                    if ARG_ALN_HEURPOS in aln_args:
                        heur_align_inst(inst, use_pos=True)
                    if ARG_ALN_HEUR in aln_args:
                        heur_align_inst(inst, use_pos=False)

                # -------------------------------------------
                # Now, do the necessary projection tasks.
                # -------------------------------------------

                # Project the classifier tags...
                if has_ll and has_gl and ARG_POS_CLASS in pos_args:
                    try:
                        project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_CLASS)
                    except GlossLangAlignException:
                        fail(F_L_G_ALN)

                # -------------------------------------------
                # Do the trans-to-lang projection...
                # -------------------------------------------

                if has_all():
                    proj_aln_method = ALN_ARG_MAP[kwargs.get('proj_aln', ARG_ALN_ANY)]
                    aln = get_trans_gloss_alignment(inst, aln_method=proj_aln_method)
                    if not aln or len(aln) == 0:
                        fail(F_T_G_ALN)
                    else:
                        # -------------------------------------------
                        # POS Projection
                        # -------------------------------------------
                        if ARG_POS_PROJ in pos_args:
                            trans_tags = trans_tag_tier(inst)

                            if not trans_tags:
                                fail(F_NO_TRANS_POS)
                            else:
                                project_trans_pos_to_gloss(inst)
                                try:
                                    project_gloss_pos_to_lang(inst, tag_method=INTENT_POS_PROJ)
                                except GlossLangAlignException as glae:
                                    fail(F_L_G_ALN)

                        # -------------------------------------------
                        # Parse projection
                        # -------------------------------------------
                        if ARG_PARSE_PROJ in parse_args:
                            try:
                                project_pt_tier(inst, proj_aln_method=proj_aln_method)
                            except PhraseStructureProjectionException as pspe:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError as nape:
                                fail(F_T_G_ALN)

                            try:
                                project_ds_tier(inst, proj_aln_method=proj_aln_method)
                            except ProjectionException as pe:
                                fail(F_PROJECTION)
                            except NoAlignmentProvidedError as nape:
                                fail(F_T_G_ALN)



                # Sort the tiers... ----------------------------------------------------
                inst.sort_tiers()

            except Exception as e:
                # ENRICH_LOG.warn("Unknown Error occurred processing instance {}".format(inst.id))
                ENRICH_LOG.debug(e)
                # raise(e)
                fail(F_UNKNOWN)

            if not reasons:
                success()


            ENRICH_LOG.info(feedback_string.format(inst_status, ','.join(reasons)))

        ENRICH_LOG.log(1000, 'Writing output file...')

        if hasattr(kwargs.get(ARG_OUTFILE), 'write'):
            xigtxml.dump(kwargs.get(ARG_OUTFILE), corp)
        else:
            xigtxml.dump(writefile(kwargs.get(ARG_OUTFILE)), corp)

        ENRICH_LOG.log(1000, 'Done.')
        ENRICH_LOG.log(1000, "{} instances written.".format(len(corp)))
Example 19
def setUp(self):
    my_path = os.path.join(testfile_dir, 'xigt/kor-ex.xml')
    self.my_igt = xigtxml.load(my_path)
Example 20
    wals_nadj_present = wals_nadj.is_language_present(language_code)
    wals_svo_present = wals_svo.is_language_present(language_code)
    wals_sv_present = wals_sv.is_language_present(language_code)
    wals_ov_present = wals_ov.is_language_present(language_code)
    wals_past_tense_present = wals_past_tense.is_language_present(language_code)
    wals_future_tense_present = wals_future_tense.is_language_present(language_code)
    # check to see if we should bother loading the language
    if (wals_svo_present and do_svo) \
            or (wals_sv_present and do_sv) \
            or (wals_ov_present and do_ov) \
            or (wals_nadj_present and do_nadj) \
            or (wals_past_tense_present and do_past_tense) \
            or (wals_future_tense_present and do_future_tense):

        xc = xigtxml.load(language, mode='full')
        if wals_nadj_present and do_nadj:
            calc = NounAdjectiveProbe(xc, language_code, False, args.ndo)
            examine_language(calc, nadj_feature_dictionary, nadj_feature_num_instances_dictionary, nadj_errors)
        if wals_svo_present and do_svo:
            calc = SVOProbe(xc, language_code, False, args.ndo)
            examine_language(calc, svo_feature_dictionary, svo_feature_num_instances_dictionary, svo_errors)
        if wals_sv_present and do_sv:
            calc = SVProbe(xc, language_code, False, args.ndo)
            examine_language(calc, sv_feature_dictionary, sv_feature_num_instances_dictionary, sv_errors)
        if wals_ov_present and do_ov:
            calc = OVProbe(xc, language_code, False, args.ndo)
            examine_language(calc, ov_feature_dictionary, ov_feature_num_instances_dictionary, ov_errors)
        if wals_past_tense_present and do_past_tense:
            calc = PastTenseProbe(xc, language_code, False, args.ndo)
            examine_language(calc, past_tense_feature_dictionary, past_tense_feature_num_instances_dictionary,
Example 21
negc = 0
correct_position = 0
incorrect_position = 0
feature_dictionary = {}

for i in range(len(odin_corpus)):
    filename = os.path.basename(odin_corpus[i])
    language_code = os.path.splitext(filename)[0]
    try:
        # this is just a check to see if we get an error here.
        # we're going to error out if we can't look up this language/feature in WALS.
        wals_code = wals_dictionary.iso_to_wals[language_code]
        wals_value = wals.feature_dictionary[wals_code]
    except KeyError:  # it wasn't in the dictionary of languages which have reported stats for feature in WALS
        continue
    xc = xigtxml.load(odin_corpus[i], mode='full')
    hasneg = False
    position = {"VNeg": 0, "NegV": 0, "[V-Neg]": 0, "[Neg-V]": 0}
    number = {"single": 0, "double": 0}

    result1 = findwords(xc)
    hasword = result1[0]
    wordpos = result1[1]
    neglist = result1[2]
    wordnum = result1[3]
    position["NegV"] = wordpos["before"]  #*(524/1059)
    position["VNeg"] = wordpos["after"]  #*(171/1059)
    number["single"] += wordnum["single"]
    number["double"] += wordnum["double"]

    result2 = findmorphs(xc)
Example 22
def wordlist(filelist, gloss=None, meta=None):
    """
    This function takes a list of Xigt-XML ODIN files, looks for the
    'normalized' ODIN tier, and grabs the contents of all gloss and
    meta lines. It tokenizes simply by matching all word characters
    (using regex's `\w` escape) so as to pull out hyphenated and dotted
    gloss line tokens.

    The output is returned as a wordlist reverse sorted by count.

    :param filelist: List of input files to process.
    :type filelist: list[str]
    :param gloss: Path to use for the output gloss wordlist.
    :type gloss: str
    :param meta: Path to use for the output meta wordlist.
    :type meta: str
    """
    gloss_words = defaultdict(int)
    meta_words  = defaultdict(int)

    # -------------------------------------------
    # Iterate over all the paths in the list of files.
    # -------------------------------------------
    for path in filelist:
        with open(path, 'r', encoding='utf-8') as f:
            # Load the XigtCorpus, using the transient mode (most memory efficient)
            xc = xigtxml.load(f, mode='transient')

            # Now, iterate over each `Igt` instance in each file,
            for igt in xc:
                # Use a xigtpath expression to find the `tier` item that is a child of this node,
                # with state="normalized" as an attribute.
                norm_tier = xigtpath.find(igt, './tier[@state="normalized"]')
                if norm_tier is None:  # skip instances with no normalized tier
                    continue

                # Next, since the `tag` attribute can be G+CR or M+AC etc., grab all lines
                # with a tag that starts with the desired tag letter.
                gloss_lines = [item for item in norm_tier if item.attributes['tag'].startswith("G")]
                meta_lines = [item for item in norm_tier if item.attributes['tag'].startswith("M")]

                # Define a local function to update the wordlists for gloss and meta
                # lines.
                def update_count(l_l, words):
                    for l in l_l:
                        if l.value():
                            for w in l.value().split():
                                for sub_w in re.findall(r'[\w]+', w):  # <-- tokenize (raw string avoids the invalid-escape warning)
                                    if sub_w.strip():
                                        words[sub_w.lower()] += 1 # <-- lowercase, and add

                # Update the counts.
                update_count(gloss_lines, gloss_words)
                update_count(meta_lines, meta_words)

    # Define a function to write out the wordlist objects to files.
    # here, we will reverse sort by frequency of the word, and
    # tab-delineate the columns.
    def write_items(words, path):
        if path:
            items = sorted(words.items(), key=lambda x: (x[1], x[0]), reverse=True)
            with open(path, 'w', encoding='utf-8') as f:
                for w, count in items:
                    f.write('{}\t{}\n'.format(w, count))

    write_items(gloss_words, gloss)
    write_items(meta_words, meta)
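
A hypothetical invocation, writing both wordlists to tab-delimited files:

wordlist(['odin-deu.xml', 'odin-fra.xml'],  # placeholder input files
         gloss='gloss_wordlist.txt',
         meta='meta_wordlist.txt')
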
Example 23
from unittest import TestCase
import os

my_dir = os.path.dirname(__file__)
seg_tests_path = os.path.join(my_dir, 'seg_tests.xml')

from xigt.codecs import xigtxml
from xigt import XigtCorpus, Igt

from intent2.xigt_helpers import xigt_find
from intent2.serialize.importers import parse_xigt_instance

# Load the testcase files
with open(seg_tests_path, 'r') as seg_tests_f:
    xc = xigtxml.load(seg_tests_f)  # type: XigtCorpus


# -------------------------------------------
# TestCases
# -------------------------------------------
class EsuTest(TestCase):
    def setUp(self):
        self.inst = xigt_find(xc, id='esu-58')  # type: Igt

    def test_segmentation(self):
        inst = parse_xigt_instance(self.inst)
        self.assertEqual(len(inst.gloss), 1)


class IkxTest(TestCase):
    def setUp(self):
Example 24
from collections import OrderedDict
from xigt.codecs import xigtxml

# etree is either from lxml.etree or xml.etree.ElementTree
etree = xigtxml.etree

### Decoding ###

### Encoding ###

### Function maps ###

if __name__ == '__main__':
    import sys
    from xigt.codecs import xigttxt
    f = sys.argv[1]
    with open(f, 'r') as fh:
        xc = xigtxml.load(fh)
    print(xigttxt.dumps(xc, pretty_print=True))
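
The empty Decoding/Encoding sections are where a derived codec would redefine the xigtxml hooks; the only hook these examples confirm is decode_igt (see Example 7). A minimal hedged override that just delegates to the stock decoder:

def decode_igt(elem):
    # Start from the library default; a real codec would customize this.
    return xigtxml.default_decode_igt(elem)
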
Example 25
from argparse import ArgumentParser

from intent.igt.igtutils import get_judgment
from xigt import xigtpath
from xigt.codecs import xigtxml
from xigt.consts import INCREMENTAL

if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('FILE', nargs='+')
    p.add_argument('-d', '--dest', required=True, help='Output directory for modified files.')
    p.add_argument('-f', '--force', action='store_true', help='Force overwrite existing files.')

    args = p.parse_args()

    for path in args.FILE:
        with open(path, 'r', encoding='utf-8') as f:
            xc = xigtxml.load(f, mode=INCREMENTAL)

            for inst in xc:
                JUDG_LOG.info('Processing instance "{}"'.format(inst.id))
                for item in xigtpath.findall(inst, 'tier[@type="' + ODIN_TIER_TYPE + '"]/item'):

                    # Skip blank lines
                    if item.value() is None:
                        continue

                    # Get the judgment and add it if it is non-null.
                    j = get_judgment(item.value())
                    if j is not None:
                        item.attributes[ODIN_JUDGMENT_ATTRIBUTE] = j
                        JUDG_LOG.debug('Judgment found on item "{}"'.format(item.id))
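
The snippet ends before the modified corpus is written back, though the --dest and --force options above suggest the intended finish; a hedged completion (sitting at the end of the same with-block, assuming os is imported) might be:

            # Hypothetical write-back honoring --dest and --force from above.
            out_path = os.path.join(args.dest, os.path.basename(path))
            if args.force or not os.path.exists(out_path):
                with open(out_path, 'w', encoding='utf-8') as out_f:
                    xigtxml.dump(out_f, xc)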