コード例 #1
0
ファイル: parsing.py プロジェクト: rgeorgi/intent
def parse_odin_inst(string, corpus = None, idnum=None):
    """
    Parse the ODIN-style text of a single instance and build an IGT instance.

    The text must contain a header of the form
    ``doc_id=<id> <start> <stop> <tag types>`` terminated by a newline. Every
    ``line=<n> tag=<tag>:<text>`` entry found in the text becomes one item of
    a raw tier on the new instance.

    :param string: the ODIN-style text for one instance.
    :param corpus: optional corpus; when ``idnum`` is not given and the corpus
                   is truthy, it is asked for the instance id.
    :param idnum: optional index passed to ``gen_item_id`` to build the
                  instance id. Takes precedence over ``corpus``.
    :return: the new instance, holding the raw tier, after
             ``basic_processing`` has been run on it.
    :raises ValueError: if no ``doc_id=`` header can be found in ``string``.
    """

    # Start by looking for the doc_id, and the line range.
    # (Raw strings keep the regex escapes such as \S away from the Python
    # string-literal parser.)
    doc_re = re.search(r'doc_id=(\S+)\s([0-9]+)\s([0-9]+)\s(.*)\n', string)
    if doc_re is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('No "doc_id=" header found in ODIN instance text: {!r}'.format(string[:80]))
    docid, lnstart, lnstop, tagtypes = doc_re.groups()

    # Pick the instance id: an explicit index wins, then the given corpus,
    # and as a last resort a fresh corpus is created and its length is used.
    if idnum is not None:
        inst_id = gen_item_id('i', idnum)
    elif corpus:
        inst_id = corpus.askIgtId()
    else:
        corpus = XigtCorpus()
        inst_id = 'i{}'.format(len(corpus))

    inst = Igt(id = inst_id, attributes={'doc-id':docid,
                                         'line-range':'%s %s' % (lnstart, lnstop),
                                         'tag-types':tagtypes})

    # Now, find all the lines: (line number, tag, text) triples.
    lines = re.findall(r'line=([0-9]+)\stag=(\S+):(.*)\n?', string)

    # Create the raw tier and add one item per line found.
    rt = Tier(id = RAW_ID, type=ODIN_TIER_TYPE, attributes={STATE_ATTRIBUTE:RAW_STATE}, igt=inst)

    for lineno, linetag, linetxt in lines:
        item = Item(id = ask_item_id(rt), text=linetxt, attributes={'tag':linetag, 'line':lineno}, tier=rt)
        rt.append(item)

    inst.append(rt)
    basic_processing(inst)

    return inst
コード例 #2
0
ファイル: igt_operations.py プロジェクト: xigt/yggdrasil
def create_text_tier_from_lines(inst, lines, id_base, state):
    """
    Build a text tier with one item per entry in ``lines``.

    Each entry must be dict-like and contain the keys 'text' and 'tag'. The
    optional keys 'labels', 'judgment' and 'lineno' are used when present.
    The tier is returned to the caller; it is not appended to ``inst`` here.

    :param inst: instance handed to ``gen_tier_id`` to produce the tier id.
    :param lines: the line entries to turn into items.
    :type lines: list[dict]
    :param id_base: base string handed to ``gen_tier_id``.
    :param state: value stored under the tier's state attribute.
    :return: the newly created tier.
    :raises Exception: if an entry is not dict-like or lacks 'text' or 'tag'.
    """
    # The parent tier that will hold all of the generated items.
    tier = Tier(id=gen_tier_id(inst, id_base), type=ODIN_TYPE, attributes={STATE_ATTRIBUTE:state})

    for entry in lines:

        # Every entry has to look like a dict carrying both required keys.
        well_formed = hasattr(entry, 'get') and 'text' in entry and 'tag' in entry
        if not well_formed:
            raise Exception("When constructing tier from lines, must be a list of dicts with keys 'text' and 'tag'.")

        # The tag string is the tag and the (non-empty) labels joined by '+'.
        tag_parts = []
        tag = entry.get('tag')
        if tag is not None:
            tag_parts.append(tag)
        labels = entry.get('labels')
        if labels:
            tag_parts.append(labels)

        # Collect the item attributes; judgment and line number are optional.
        item_attributes = {ODIN_TAG_ATTRIBUTE: '+'.join(tag_parts)}

        judgment = entry.get('judgment')
        if judgment is not None:
            item_attributes[ODIN_JUDGMENT_ATTRIBUTE] = judgment

        lineno = entry.get('lineno')
        if lineno:
            item_attributes['line'] = lineno

        # The item id is derived from the tier id and the current item count.
        new_item = Item(id=gen_item_id(tier.id, len(tier)),
                        attributes=item_attributes,
                        text=entry.get('text'))
        tier.append(new_item)

    return tier
コード例 #3
0
ファイル: parsing.py プロジェクト: rgeorgi/intent
def raw_txt_to_inst(string, corpus=None, idnum=None):
    """
    Create an IGT instance from raw lines of text, assuming L-G-T order.

    Three lines are tagged language, gloss, translation. With four lines the
    first one is assumed to be native orthography and is tagged as a language
    line with the additional '+FR' label, followed by language, gloss and
    translation.

    :param string: the raw text, one line per newline-separated row.
    :param corpus: optional corpus; when ``idnum`` is not given and the corpus
                   is truthy, it is asked for the instance id.
    :param idnum: optional index passed to ``gen_item_id`` to build the
                  instance id. Takes precedence over ``corpus``.
    :return: the new instance holding a single raw tier with one item per line.
    :raises RawTextParseError: if the text has fewer than three or more than
                               four lines, or if any line is blank.
    """
    lines = string.split('\n')
    if len(lines) < 3:
        raise RawTextParseError("Three lines are assumed for raw text. Instead got {}".format(len(lines)))

    # Decide on the tag sequence before anything is built, so that an
    # unsupported number of lines is rejected up front.
    if len(lines) == 4:
        # If we have four lines, assume that the first is native orthography.
        linetags = (ODIN_LANG_TAG + '+FR', ODIN_LANG_TAG, ODIN_GLOSS_TAG, ODIN_TRANS_TAG)
    elif len(lines) == 3:
        linetags = (ODIN_LANG_TAG, ODIN_GLOSS_TAG, ODIN_TRANS_TAG)
    else:
        raise RawTextParseError("Unknown number of lines...")

    # Pick the instance id: an explicit index wins, then the given corpus,
    # and as a last resort a fresh corpus is created and its length is used.
    if idnum is not None:
        inst_id = gen_item_id('i', idnum)
    elif corpus:
        inst_id = corpus.askIgtId()
    else:
        corpus = XigtCorpus()
        inst_id = 'i{}'.format(len(corpus))

    inst = Igt(id = inst_id)
    rt = Tier(id = RAW_ID, type=ODIN_TIER_TYPE, attributes={STATE_ATTRIBUTE:RAW_STATE}, igt=inst)

    for linetag, linetxt in zip(linetags, lines):

        # Blank lines cannot be used as any of the expected line types.
        if not linetxt.strip():
            raise RawTextParseError("The {} line is empty: {}".format(linetag, linetxt))

        item = Item(id=ask_item_id(rt), text=linetxt, attributes={'tag':linetag})
        rt.append(item)

    inst.append(rt)
    return inst
コード例 #4
0
ファイル: parsing.py プロジェクト: rgeorgi/intent
def parse_odin_xc(text, require_trans = True, require_gloss = True, require_lang = True, limit = None):
    """
    Read ODIN-style text and build a xigt corpus from the instances in it.

    Each instance is parsed with ``parse_odin_inst``. Instances whose gloss
    and language lines cannot be aligned are skipped. Any other instance is
    added to the corpus only if it has every line type that is required.

    :param text: the ODIN-style text containing zero or more instances.
    :param require_trans: only keep instances that have a translation line.
    :param require_gloss: only keep instances that have a gloss line.
    :param require_lang: only keep instances that have a language line.
    :param limit: stop once this many instances have been parsed. Instances
                  that parsed but were not kept count towards the limit.
                  ``None`` means no limit.
    :return: the corpus holding all instances that met the requirements.
    """
    # Initialize the corpus
    xc = XigtCorpus()

    # Replace invalid characters...
    data = replace_invalid_xml(text)

    # Split the text into instances: each one starts at "doc_id=" and runs
    # (non-greedily) up to the next blank line.
    # NOTE(review): a final instance that is not followed by a blank line is
    # not matched by this pattern -- confirm the input always ends with one.
    inst_txts = re.findall(r'doc_id=[\s\S]+?\n\n', data)

    #=======================================================================
    # Begin parsing...
    #=======================================================================

    parsed = 0
    PARSELOG.info('Beginning parse')
    for inst_num, inst_txt in enumerate(inst_txts):

        # Progress report, keyed on the number of instances parsed so far.
        if parsed % 250 == 0:
            PARSELOG.info('Parsing instance %d...', parsed)

        # Handle the requirement for 1_to_1 alignment.
        try:
            inst = parse_odin_inst(inst_txt, corpus=xc, idnum=inst_num)
        except GlossLangAlignException:
            PARSELOG.warning('Gloss and language could not be automatically aligned for instance "%s". Skipping',
                             gen_item_id('i', inst_num))
            continue

        # Try to get the translation line. ---------------------------------
        try:
            hastrans = trans_lines(inst)
        except NoTransLineException as ntle:
            PARSELOG.info(ntle)
            hastrans = False

        # Try to get the gloss line. --------------------------------------
        try:
            hasgloss = inst.gloss
        except NoGlossLineException as ngle:
            PARSELOG.info(ngle)
            hasgloss = False

        # Try to get the language line. ------------------------------------
        try:
            haslang = inst.lang
        except NoLangLineException as nlle:
            PARSELOG.info(nlle)
            haslang = False

        parsed += 1

        # A constraint holds when the line is present or is not required.
        trans_constraint = hastrans or not require_trans
        gloss_constraint = hasgloss or not require_gloss
        lang_constraint = haslang or not require_lang

        if trans_constraint and gloss_constraint and lang_constraint:
            xc.append(inst)
        else:
            PARSELOG.info('Requirements for instance "%s" were not satisfied. Skipping', inst.id)

        # If we have reached the limit of instances that have been requested,
        # stop processing.
        if limit is not None and limit == parsed:
            break

    # Return the corpus
    return xc
コード例 #5
0
ファイル: id_tests.py プロジェクト: rgeorgi/intent
 def standard_id_test(self):
     """Check that gen_item_id('i', 0) produces the id 'i1'."""
     generated = gen_item_id('i', 0)
     self.assertEqual('i1', generated)