Ejemplo n.º 1
0
def string_sentence_ruleapplication(sentence_segment, format='xml', idx=True, id=-1, filtering=None, timeout=None, **kwargs):
    """Deserialize a sentence string and run rule application on its first sentence.

    Args:
        sentence_segment: Serialized sentence data, passed to ``loads``.
        format: Serialization format forwarded to ``loads``.
        idx: Forwarded to ``read_graphs``.
        id: Identifier used only to label the parse-error message.
        filtering: Forwarded to ``sentence_ruleapplication``.
        timeout: Forwarded to ``sentence_ruleapplication``.
        **kwargs: Extra keyword arguments forwarded to
            ``sentence_ruleapplication``.

    Returns:
        Whatever ``sentence_ruleapplication`` returns for the first element
        of the deserialized collection.

    Raises:
        Exception: If ``loads`` raises ``xml.ParseError``.
    """
    try:
        sentence_collection = loads(sentence_segment, format=format)
    except xml.ParseError:
        # %s (with the argument wrapped in a tuple) instead of %d: a
        # non-numeric id must not turn the parse error into a TypeError.
        raise Exception('Error parsing sentence XML for id %s' % (id,))

    read_graphs(sentence_collection, idx=idx)

    # Only the first sentence of the collection is processed.
    return sentence_ruleapplication(sentence_collection[0], filtering=filtering, timeout=timeout, **kwargs)
Ejemplo n.º 2
0
def dataset_translation_rulextraction(sentence_collection,
                                      idx=True,
                                      filtering=None,
                                      timeout=None,
                                      **kwargs):
    """Extract rules from every usable sentence of a collection.

    A sentence is skipped when ``filtering`` rejects it, when its source
    graph, target tokens, alignment or ``alignment.sgtt`` is None, or when
    extraction exceeds ``timeout``.

    Args:
        sentence_collection: Iterable of sentence objects; it is first
            passed to ``read_graphs``.
        idx: If true, ``sentence.target.tokenized_idx`` is used as the target
            token sequence, otherwise ``sentence.target.tokenized_text``.
            Also forwarded to ``read_graphs``.
        filtering: Optional object with a ``filter(sentence)`` method; a
            falsy result skips the sentence.
        timeout: Optional number of seconds; when truthy, each extraction
            runs inside ``to.timeout(seconds=timeout)``.
        **kwargs: Extra keyword arguments forwarded to ``rulextraction``.

    Returns:
        A list holding the rules returned by ``rulextraction`` for all
        processed sentences, in iteration order.
    """
    read_graphs(sentence_collection, idx=idx)

    rules = []

    for sentence in sentence_collection:
        if idx:
            tok = sentence.target.tokenized_idx
        else:
            tok = sentence.target.tokenized_text

        if filtering is not None and not filtering.filter(sentence):
            # The graph may be None (see the check below), so guard len()
            # to keep the log call from raising on a skipped sentence.
            graph = sentence.source.graph
            logging.info(
                'Skipping sentence with id %s (%s) due to filtering (source token number %d, graph size %d)',
                sentence.id, sentence.orig_id,
                len(sentence.source.tokenized_text),
                len(graph) if graph is not None else 0)
            continue

        # Extraction needs a graph, target tokens and an alignment.
        if (sentence.source.graph is None or tok is None
                or sentence.alignment is None
                or sentence.alignment.sgtt is None):
            continue

        logging.debug('Starting rule extraction for sentence %s',
                      sentence.id)

        # Each sgtt entry is unpacked as a pair; its second element is
        # itself consumed by dict() as a (key, value) pair.
        alignments = dict(index
                          for _, index in sentence.alignment.sgtt)
        alignment_dict = utility.create_alignment_dict(
            alignments, sentence.source.graph)

        tok = [unicode(x) for x in tok]

        try:
            if timeout:
                with to.timeout(seconds=timeout):
                    sentence_rules = rulextraction(sentence.source.graph,
                                                   tok, alignment_dict,
                                                   **kwargs)
            else:
                sentence_rules = rulextraction(sentence.source.graph, tok,
                                               alignment_dict, **kwargs)
        except to.TimeoutError:
            logging.warning(
                'Rule extraction for sentence with id %s (%s) failed due to timeout after %d seconds',
                sentence.id, sentence.orig_id, timeout)
            continue

        rules.extend(sentence_rules)
        logging.info(
            'Extracted %d rules from sentence with id %s (%s)',
            len(sentence_rules), sentence.id, sentence.orig_id)

    return rules
Ejemplo n.º 3
0
def dataset_ruleapplication(sentence_collection, idx=True, filtering=None, timeout=None, **kwargs):
    """Run rule application on every sentence and collect the results.

    Args:
        sentence_collection: Iterable of sentence objects; it is first
            passed to ``read_graphs``.
        idx: Forwarded to ``read_graphs``.
        filtering: Forwarded to ``sentence_ruleapplication``.
        timeout: Forwarded to ``sentence_ruleapplication``.
        **kwargs: Extra keyword arguments forwarded to
            ``sentence_ruleapplication``.

    Returns:
        A dict keyed by the first element of each pair returned by
        ``sentence_ruleapplication``, with the second element as value.
    """
    read_graphs(sentence_collection, idx=idx)

    # One (key, value) pair per sentence, produced lazily in input order.
    results = (
        sentence_ruleapplication(item, filtering=filtering, timeout=timeout, **kwargs)
        for item in sentence_collection
    )

    return {key: value for key, value in results}
Ejemplo n.º 4
0
def string_create_sentence_disc_rules(sentence_xml_string, _, disc_rule_id_offset=120000000, idx=True, filtering=None, format='xml'):
    """Deserialize a sentence string and build disc rules for its first sentence.

    Args:
        sentence_xml_string: Serialized sentence data, passed to ``loads``.
        _: Ignored; this function never reads it.
        disc_rule_id_offset: Forwarded to ``create_sentence_disc_rules``.
        idx: Forwarded to ``read_graphs``.
        filtering: Forwarded to ``create_sentence_disc_rules``.
        format: Serialization format forwarded to ``loads``.

    Returns:
        Whatever ``create_sentence_disc_rules`` returns for the first element
        of the deserialized collection.

    Raises:
        Exception: If ``loads`` raises ``xml.ParseError``.
    """
    try:
        sentence_collection = loads(sentence_xml_string, format=format)
    except xml.ParseError:
        # This function has no sentence id to report. The previous message
        # formatted the name `id` with %d; with no such parameter here that
        # is the builtin function, so the handler itself raised TypeError
        # and hid the parse error.
        raise Exception('Error parsing sentence XML')

    read_graphs(sentence_collection, idx=idx)

    # Only the first sentence of the collection is processed.
    sentence = sentence_collection[0]

    return create_sentence_disc_rules(sentence, disc_rule_id_offset, filtering)