コード例 #1
0
ファイル: splitconll.py プロジェクト: hsensoy/uparse
if __name__ == "__main__":
    # Command-line entry point: load every item yielded by open2() for the
    # given CoNLL file and divide the resulting list into two pieces of
    # sizes controlled by --n, counted from the head (default) or the tail.
    import argparse

    parser = argparse.ArgumentParser(description='CoNLL file to be splited')
    parser.add_argument('file', help='CoNLL file to be split')
    parser.add_argument('--tail', action='store_true', default=False,
                        help='tail/head -<n> file for the first piece. Default: head')
    # type=int is required: without it a value given on the command line
    # arrives as a str and cannot be negated or used as a slice bound.
    parser.add_argument('--n', type=int, default=1000, help="Number of rows. Default: 1000")
    parser.add_argument('--first', help='File name for the first piece. Default: <file>.1.(count)')
    parser.add_argument('--second', help='File name for the second piece.  Default: <file>.2.(count)')

    args = parser.parse_args()

    # Validate with parser.error rather than assert: asserts are stripped
    # when Python runs with -O, and parser.error prints a usage message.
    if args.n < 0:
        parser.error("--n must be a non-negative integer")
    if args.first and args.first == args.second:
        parser.error("--first and --second must name different files")

    with open2(args.file) as fp:
        corpus = list(fp)

        if args.tail:
            # Compute the cut point explicitly instead of slicing with -n:
            # -0 == 0, so corpus[:-0] would be empty and corpus[-0:] would be
            # the whole corpus. Clamping at 0 keeps the previous result when
            # n exceeds the corpus size (everything goes to the second piece).
            # NOTE(review): the --tail help text describes the first piece as
            # the tail, but the first piece here is everything before the
            # last n items -- confirm which is intended.
            split_at = max(len(corpus) - args.n, 0)
        else:
            split_at = args.n

        corpus1 = corpus[:split_at]
        corpus2 = corpus[split_at:]

        # Explicit --first/--second names win; otherwise a name is derived by
        # filename() from the input path, the piece number and the piece size.
        file1 = args.first or filename(args.file, 1, len(corpus1))
        file2 = args.second or filename(args.file, 2, len(corpus2))

        print >> sys.stderr, "%s (%d) is splitted as \n\t%s (%d)\n\t%s (%d)" % (
            args.file, len(corpus), file1, len(corpus1), file2, len(corpus2))
コード例 #2
0
ファイル: sortembeddings.py プロジェクト: hsensoy/uparse
if __name__ == "__main__":
    # Script entry point: count how often each word form occurs across the
    # given CoNLL files, then print the rows of the embedding file whose
    # first column is one of those forms, most frequent form first.
    import argparse

    cli = argparse.ArgumentParser(description='CoNLL file transformer')
    cli.add_argument('efile', help="Embedding file")
    cli.add_argument('files', metavar='f', nargs='+',
                     help='List of CoNLL corpus files')
    args = cli.parse_args()

    # Pass 1: occurrence count per word form over all corpus files.
    form_counts = defaultdict(int)
    for corpus_path in args.files:
        print >> sys.stderr, "Caching words in %s"%corpus_path
        with open2(corpus_path) as corpus:
            for sentence in corpus:
                for entry in sentence:
                    form_counts[entry._form] += 1

    # Pass 2: index the embedding rows by their first tab-separated column.
    # A later row with the same first column overwrites an earlier one.
    with open(args.efile) as embedding_file:
        rows = [line.strip().split('\t') for line in embedding_file]
    embeddings = {row[0]: row for row in rows}

    # Emit known forms in descending frequency; forms without an embedding
    # row are skipped.
    by_frequency = sorted(form_counts, key=form_counts.get, reverse=True)
    for form in by_frequency:
        if form in embeddings:
            print >> sys.stdout, "\t".join(embeddings[form])
コード例 #3
0
ファイル: diffconll.py プロジェクト: hsensoy/uparse
__author__ = 'husnusensoy'

if __name__ == "__main__":
    # Command-line entry point: print the sentences of file A that do not
    # occur in file B, or, with --count, report how many sentences of A do
    # occur in B. Indentation uses spaces only; the earlier mix of tabs and
    # spaces parsed only under Python 2's tab-equals-8-columns rule.
    import argparse

    parser = argparse.ArgumentParser(
        description='Extracts A-B')
    parser.add_argument('fileA', help='CoNLL file A')
    parser.add_argument('fileB', help='CoNLL file B')
    parser.add_argument('--count', action='store_true', help='Just show number of overlapping sentence')

    args = parser.parse_args()
    count = 0

    with open2(args.fileA) as A, open2(args.fileB) as B:
        # Membership index of B, keyed by the tuple form of s.sentence()
        # so that each entry is hashable.
        seen_in_b = {tuple(s.sentence()) for s in B}

        for s in A:
            if tuple(s.sentence()) in seen_in_b:
                # Sentence is present in both files.
                count += 1
            elif not args.count:
                # Sentence exists only in A: print each word on its own
                # line, then an empty line after the sentence.
                for word in s:
                    print >> sys.stdout, str(word)

                print >> sys.stdout

    if args.count:
        print >> sys.stderr, "Total number of overlapping sentences are %d"%count
コード例 #4
0
ファイル: discretevectorconll.py プロジェクト: hsensoy/uparse
    parser.add_argument('--target', type=str, default='FEATS',
                        choices=['FEATS', 'LEMMA'],
                        help="CoNLL file field to be replaced/extended")
    parser.add_argument('--replace', action='store_true', default=False,
                        help="Replace/Extend the relevant field.")

    args = parser.parse_args()

    # Lookup table built from the vector file: the first tab-separated column
    # of each line maps to the list of the remaining columns.
    vlookup = {}
    with open(args.vectorfile) as vf:
        for token in vf:
            fields = token.strip().split('\t')

            vlookup[fields[0]] = fields[1:]

    # Rewrite the chosen field of every word whose form has an entry in the
    # lookup table, and print every word (modified or not) followed by an
    # empty line after each sentence.
    with open2(args.file) as cf:
        for sentence in cf:
            for word in sentence:
                if word._form in vlookup:
                    values = vlookup[word._form]
                    if args.target == 'FEATS':
                        # One "F<index>=<value>" item per vector component.
                        vector_feats = "|".join(
                            "F%d=%s" % (i, v) for i, v in enumerate(values))
                        # Extend only when there is an existing value AND
                        # --replace was not requested. The previous condition
                        # (`word._feats or args.replace`) extended the field
                        # even with --replace, and concatenated onto an empty
                        # field when --replace was given.
                        if word._feats and not args.replace:
                            word._feats = word._feats + "|" + vector_feats
                        else:
                            word._feats = vector_feats
                    else:
                        # LEMMA target: the lemma is always overwritten, and
                        # the lookup entry must hold exactly one value.
                        assert len(values) == 1
                        word._lemma = values[0]
                print >> sys.stdout, str(word)

            print >> sys.stdout