Example #1
def run_tagger(corpus, outdir, tagger_jar):
    """
    Run the ark-tweet-tagger on all the (unannotated) documents in
    the corpus and save the results in the specified directory
    """
    for k in corpus:
        doc   = corpus[k]

        k_txt           = copy.copy(k)
        k_txt.stage     = 'turns'
        k_txt.annotator = None

        root  = stac.id_to_path(k_txt)
        txt_file = os.path.join(outdir, 'tmp', root + '.txt')
        txt_dir  = os.path.split(txt_file)[0]
        if not os.path.exists(txt_dir):
            os.makedirs(txt_dir)
        with codecs.open(txt_file, 'w', 'utf-8') as f:
            print(extract_turns(doc), file=f)

        tagged_file = tagger_file_name(k, outdir)
        tagged_dir  = os.path.split(tagged_file)[0]
        if not os.path.exists(tagged_dir):
            os.makedirs(tagged_dir)
        # from the runTagger script
        cmd = tagger_cmd(tagger_jar, txt_file)
        with open(tagged_file, 'wb') as tf:
            subprocess.call(cmd, stdout=tf)
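
A minimal sketch of how run_tagger might be driven. The reader calls below are an assumption based on the educe corpus API, and the corpus root and tagger jar paths are hypothetical:

import educe.stac

CORPUS_DIR = 'data/stac'              # hypothetical corpus root
TAGGER_JAR = 'lib/ark-tweet-nlp.jar'  # hypothetical tagger jar

reader = educe.stac.Reader(CORPUS_DIR)   # assumed educe API
# only the unannotated stage is fed to the tagger
anno_files = reader.filter(reader.files(),
                           lambda k: k.stage == 'unannotated')
corpus = reader.slurp(anno_files, verbose=True)
run_tagger(corpus, 'out', TAGGER_JAR)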
Example #2
def parsed_file_name(k, dir_name):
    """
    Given an educe.corpus.FileId and directory, return the file path
    within that directory that corresponds to the corenlp output
    """
    k2 = copy.copy(k)
    k2.stage = 'parsed'
    k2.annotator = 'stanford-corenlp'
    return os.path.join(dir_name, stac.id_to_path(k2) + '.xml')
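
The function only rewrites the stage and annotator fields on a copy of the FileId before delegating to stac.id_to_path. A hedged illustration; the printed layout is an assumption about id_to_path, not a documented guarantee:

from educe.corpus import FileId

k = FileId(doc='pilot01', subdoc='01',
           stage='discourse', annotator='GOLD')
print(parsed_file_name(k, 'out'))
# e.g. out/pilot01/parsed/stanford-corenlp/pilot01_01.xml
# (assumes id_to_path lays paths out as doc/stage/annotator/doc_subdoc)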
Example #3
def tagger_file_name(k, dir):
    """
    Given an educe.corpus.FileId and directory, return the file path
    within that directory that corresponds to the tagger output
    """
    k2 = copy.copy(k)
    k2.stage     = 'pos-tagged'
    k2.annotator = 'ark-tweet-nlp'
    return os.path.join(dir, stac.id_to_path(k2) + '.conll')
Example #4
def mk_output_path(cls, odir, k, extension=''):
    """
    Generate a path within a parent directory, given a
    fileid
    """
    relpath = id_to_path(k)
    ofile_dirname = os.path.join(odir, os.path.dirname(relpath))
    ofile_basename = os.path.basename(relpath)
    return os.path.join(ofile_dirname, ofile_basename) + extension
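
The cls parameter suggests this is a classmethod lifted out of its defining class; the sketch below assumes a hypothetical host class Harness. Note that splitting relpath and rejoining it is a no-op, so the result is simply os.path.join(odir, id_to_path(k)) + extension:

opath = Harness.mk_output_path('out', k, extension='.seg')  # Harness and '.seg' are hypothetical
odir = os.path.dirname(opath)
if not os.path.exists(odir):  # callers still create the directory themselves
    os.makedirs(odir)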
Example #5
def tagger_file_name(doc_key, root):
    """Get the file path to the output of the POS tagger for a document.

    The returned file path is a .conll file within the given directory.

    Parameters
    ----------
    doc_key : educe.corpus.FileId
        FileId of the document

    root : string
        Path to the folder containing annotations for this corpus,
        including the output of the POS tagger.

    Returns
    -------
    res : string
        Path to the .conll file output by the POS tagger.
    """
    doc_key2 = copy.copy(doc_key)
    doc_key2.stage = 'pos-tagged'
    doc_key2.annotator = 'ark-tweet-nlp'
    return os.path.join(root, stac.id_to_path(doc_key2) + '.conll')
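
A worked example of the mapping; the directory layout shown is an assumption about stac.id_to_path rather than a documented guarantee:

k = FileId(doc='s1-league2-game1', subdoc='02',
           stage='unannotated', annotator=None)
tagger_file_name(k, 'data/stac')
# e.g. 'data/stac/s1-league2-game1/pos-tagged/ark-tweet-nlp/s1-league2-game1_02.conll'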
Example #6
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting
    by aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.
    """

    if split:
        # for each document, how many digits do we need to represent the
        # turns in that document; for essentially cosmetic purposes
        # (padding)
        digits = {}
        for d in frozenset([k.doc for k in corpus]):
            turns = []
            for k in corpus:
                if k.doc == d:
                    turns.extend([x for x in corpus[k].units
                                  if stac.is_turn(x)])
            turn_ids = [stac.turn_id(t)[0] for t in turns]
            digits[d] = max(2, int(math.ceil(math.log10(max(turn_ids)))))

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]

        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None

        if split:
            nb_digits = digits[k.doc]
            for tid, ttext in turn_id_text(doc):
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(nb_digits)

                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)

                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)

                txt_files.append(txt_file)
        else:
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for _, ttext in turn_id_text(doc):
                    print(ttext, file=f)
            txt_files.append(txt_file)

    # run CoreNLP
    corenlp_wrapper = CoreNlpWrapper(corenlp_dir)
    corenlp_props = [] if split else ['ssplit.eolonly=true']
    corenlp_outdir = corenlp_wrapper.process(txt_files, outdir,
                                             properties=corenlp_props)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        from_path = os.path.join(corenlp_outdir, sfile)
        # targeted (STAC) filename
        k, tid = from_corenlp_output_filename(sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
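
Driving run_pipeline looks much like the tagger example above; the reader calls are the same assumed educe API, and the CoreNLP install directory is hypothetical:

reader = educe.stac.Reader('data/stac')
anno_files = reader.filter(reader.files(),
                           lambda k: k.stage == 'unannotated')
corpus = reader.slurp(anno_files, verbose=True)
run_pipeline(corpus, 'out', 'lib/corenlp', split=False)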
Example #7
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents in
    the corpus and save the results in the specified directory

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting
    by aggregating consecutive turns with the same speaker, which may somewhat
    mitigate the loss of coreference information.
    """

    # for each document, how many digits do we need to represent the turns
    # in that document; for essentially cosmetic purposes (padding)
    digits = {}
    for d in frozenset([ k.doc for k in corpus ]):
        turns = []
        for k in corpus:
            if k.doc == d:
                turns.extend(filter(stac.is_turn, corpus[k].units))
        turn_ids  = [ int(t.features['Identifier']) for t in turns ]
        digits[d] = max(2,int(math.ceil(math.log10(max(turn_ids)))))

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc   = corpus[k]
        turns = sorted(filter(stac.is_turn, doc.units),
                       key=lambda k:k.span)

        k_txt           = copy.copy(k)
        k_txt.stage     = 'turns'
        k_txt.annotator = None

        if split:
            for turn in turns:
                ttext = stac.split_turn_text(doc.text_for(turn))[1]
                tid   = turn.features['Identifier']
                root  = stac.id_to_path(k_txt) + '_' + tid.zfill(digits[k.doc])

                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir  = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)

                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print >> f, ttext

                txt_files.append(txt_file)
        else:
            root     = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir  = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for turn in turns:
                    ttext = stac.split_turn_text(doc.text_for(turn))[1]
                    print >> f, ttext
            txt_files.append(txt_file)

    # manifest tells corenlp what files to read as input
    manifest_dir  = os.path.join(outdir, 'tmp')
    manifest_file = os.path.join(manifest_dir, 'manifest')
    with codecs.open(manifest_file, 'w', 'utf-8') as f:
        print >> f, '\n'.join(txt_files)

    # java properties to control behaviour of corenlp
    properties = [] if split else ['ssplit.eolonly=true']
    props_file = os.path.join(manifest_dir, 'corenlp.properties')
    with codecs.open(props_file, 'w', 'utf-8') as f:
        print >> f, '\n'.join(properties)

    # run corenlp (will take a while for it to load its various models)
    jars   = [ x for x in os.listdir(corenlp_dir) if os.path.splitext(x)[1] == '.jar' ]
    cp_sep = ':' if os.name != 'nt' else ';'

    corenlp_outdir = os.path.join(outdir, 'corenlp')
    if not os.path.exists(corenlp_outdir):
        os.makedirs(corenlp_outdir)

    cmd = [ 'java'
          , '-cp', cp_sep.join(jars)
          , '-Xmx3g'
          , 'edu.stanford.nlp.pipeline.StanfordCoreNLP'
          , '-filelist',  manifest_file
          , '-props',     props_file
          , '-outputDirectory', corenlp_outdir
          ]
    subprocess.call(cmd, cwd=corenlp_dir)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml': continue
        k, tid = from_corenlp_output_filename(sfile)
        from_path = os.path.join(corenlp_outdir, sfile)
        to_path   = parsed_file_name(k, outdir)
        to_dir    = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
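
For reference, the java command this older version assembles comes out roughly as below; the classpath jars depend on the CoreNLP release, so the values here are illustrative:

# cmd as passed to subprocess.call, with illustrative paths:
cmd = ['java',
       '-cp', 'stanford-corenlp.jar:stanford-corenlp-models.jar',
       '-Xmx3g',
       'edu.stanford.nlp.pipeline.StanfordCoreNLP',
       '-filelist', 'out/tmp/manifest',
       '-props', 'out/tmp/corenlp.properties',
       '-outputDirectory', 'out/corenlp']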