Example #1
0
def read_conll_depparse(filename):
    """Read a CoNLL-style dependency-parse file into a list of Sentence objects.

    Blank lines separate sentences. For each token line the word, lemma and
    POS columns are added as a Token; column 7 (head index) and column 8
    (dependency label) become a basic Dependency (the 'root' relation is not
    stored), and column 9, when not '_', is parsed as '|'-separated
    "head:label" pairs added as extra (enhanced) dependencies.

    :param filename: path to the CoNLL file (any format smart_file_handler
        accepts, e.g. plain or compressed).
    :return: list of Sentence objects, indexed in file order.
    """
    fin = smart_file_handler(filename, 'r')
    try:
        all_sents = []
        sent_idx = 0
        sent = Sentence(sent_idx)

        # Iterate the handle directly instead of readlines(): streams the
        # file rather than materializing it in memory.
        for line_idx, line in enumerate(fin):
            # NOTE(review): '\r\n' line endings would never match here and
            # sentence boundaries would be missed — assumes Unix newlines.
            if line == '\n':
                # Sentence boundary. No deepcopy needed: sent is rebound to
                # a fresh Sentence immediately, so the appended object is
                # never mutated afterwards.
                all_sents.append(sent)
                sent_idx += 1
                sent = Sentence(sent_idx)
            else:
                items = line.strip().split('\t')
                try:
                    token_idx = int(items[0])
                except ValueError:
                    # Not a token line (e.g. comment / malformed) — skip.
                    continue
                if token_idx == sent.num_tokens:
                    # Same 1-based index as the previously added token.
                    log.warning(
                        'line #{} ({}) has duplicated token index, ignored.'.
                        format(line_idx,
                               line.strip().replace('\t', ' ')))
                    continue
                word = items[1]
                lemma = items[2]
                pos = items[4]
                sent.add_token(Token(word, lemma, pos))
                try:
                    head_idx = int(items[6])
                except ValueError:
                    # No parseable head — token kept, dependencies skipped.
                    continue
                dep_label = items[7]
                if dep_label != 'root':
                    # Convert 1-based CoNLL indices to 0-based.
                    sent.add_dep(
                        Dependency(label=dep_label,
                                   head_idx=head_idx - 1,
                                   mod_idx=token_idx - 1,
                                   extra=False))
                if items[8] != '_':
                    for e_dep in items[8].strip().split('|'):
                        try:
                            e_dep_head_idx = int(e_dep.split(':')[0])
                        except ValueError:
                            continue
                        # Label may itself contain ':', so rejoin the rest.
                        e_dep_label = ':'.join(e_dep.split(':')[1:])
                        sent.add_dep(
                            Dependency(label=e_dep_label,
                                       head_idx=e_dep_head_idx - 1,
                                       mod_idx=token_idx - 1,
                                       extra=True))

        # A file that does not end with a blank line would otherwise
        # silently drop its final sentence.
        if sent.num_tokens > 0:
            all_sents.append(sent)
    finally:
        # Always release the handle, even if parsing raises.
        fin.close()

    return all_sents
Example #2
0
def read_doc_from_corenlp(filename):
    """Parse a CoreNLP XML output file into a Document.

    The XML is fed through an etree parser with a CoreNLPTarget, which
    yields the sentences and coreference chains; the document name is the
    input filename with directory and extension stripped.

    :param filename: path to the CoreNLP XML file (smart_file_handler
        transparently handles compressed variants).
    :return: the constructed Document.
    """
    log.info('Reading CoreNLP document from {}'.format(filename))
    input_xml = smart_file_handler(filename)
    try:
        xml_parser = etree.XMLParser(target=CoreNLPTarget())
        sents, corefs = etree.parse(input_xml, xml_parser)
        doc_name = splitext(basename(filename))[0]
        return Document.construct(doc_name, sents, corefs)
    finally:
        # Close in a finally so a parse error cannot leak the handle.
        input_xml.close()
Example #3
0
from utils import smart_file_handler

if __name__ == '__main__':
    # Command-line entry point: read every CoreNLP-parsed *.xml.bz2 file in
    # input_path, build a Script from each document that yields events, and
    # write the resulting corpus to output_path.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('input_path',
                            help='directory to CoreNLP parsed xml files')
    arg_parser.add_argument('output_path',
                            help='path to write script corpus file')
    arg_parser.add_argument('-v',
                            '--verbose',
                            help='print all document names',
                            action='store_true')
    args = arg_parser.parse_args()

    # Keep only regular files with the expected suffix, in sorted order so
    # runs are deterministic.
    candidates = (join(args.input_path, entry)
                  for entry in listdir(args.input_path))
    input_files = sorted(path for path in candidates
                         if isfile(path) and path.endswith('xml.bz2'))

    script_corpus = ScriptCorpus()

    for path in input_files:
        document = read_corenlp_doc(path, verbose=args.verbose)
        script = Script.from_doc(document)
        # Documents without events contribute nothing to the corpus.
        if script.has_events():
            script_corpus.add_script(script)

    with smart_file_handler(args.output_path, 'w') as fout:
        fout.write(script_corpus.to_text())
 def __iter__(self):
     """Yield one whitespace-split token list per line, over every file
     in self.dirname (directory-listing order)."""
     for fname in listdir(self.dirname):
         # Use a context manager so each handle is closed promptly
         # instead of being leaked; smart_file_handler supports the
         # with-protocol (it is used that way elsewhere in this file).
         with smart_file_handler(join(self.dirname, fname), 'r') as fin:
             for line in fin:
                 yield line.split()
 def __iter__(self):
     """Yield self.from_text_fn(line) for every non-blank stripped line
     of every file in self.filenames, in order."""
     for filename in self.filenames:
         # Iterate the handle directly (streams the file) rather than
         # readlines() (loads it whole), and close it via the context
         # manager instead of leaking it.
         with smart_file_handler(filename, 'r') as fin:
             for raw_line in fin:
                 line = raw_line.strip()
                 if line:
                     yield self.from_text_fn(line)