if __name__ == "__main__":
    import argparse

    # Split a CoNLL corpus file into two pieces of n and (total - n) sentences.
    parser = argparse.ArgumentParser(description='CoNLL file to be split')
    parser.add_argument('file', help='CoNLL file to be split')
    parser.add_argument('--tail', action='store_true', default=False,
                        help='tail/head -<n> file for the first piece. Default: head')
    # BUGFIX: without type=int a command-line value arrives as a string and
    # the slice expressions below (corpus[:-args.n]) raise TypeError.
    parser.add_argument('--n', type=int, default=1000, help="Number of rows. Default: 1000")
    parser.add_argument('--first', help='File name for the first piece. Default: <file>.1.(count)')
    parser.add_argument('--second', help='File name for the second piece. Default: <file>.2.(count)')
    args = parser.parse_args()

    # The two output names must differ unless both are left to their defaults.
    # (parser.error instead of assert: asserts are stripped under python -O.)
    if args.first == args.second and (args.first or args.second):
        parser.error("--first and --second must name different files")

    # open2 / filename are helpers defined elsewhere in this project.
    with open2(args.file) as fp:
        corpus = [conll for conll in fp]

    if args.tail:
        # Last n sentences become the first piece.
        corpus1 = corpus[:-args.n]
        corpus2 = corpus[-args.n:]
    else:
        # First n sentences become the first piece (default).
        corpus1 = corpus[:args.n]
        corpus2 = corpus[args.n:]

    file1 = args.first if args.first else filename(args.file, 1, len(corpus1))
    file2 = args.second if args.second else filename(args.file, 2, len(corpus2))

    print >> sys.stderr, "%s (%d) is splitted as \n\t%s (%d)\n\t%s (%d)" % (
        args.file, len(corpus), file1, len(corpus1), file2, len(corpus2))
if __name__ == "__main__":
    import argparse

    # Keep only the embedding rows whose word occurs in the given corpora,
    # emitted in descending corpus-frequency order.
    parser = argparse.ArgumentParser(description='CoNLL file transformer')
    parser.add_argument('efile', help="Embedding file")
    parser.add_argument('files', metavar='f', nargs='+', help='List of CoNLL corpus files')
    args = parser.parse_args()

    # Count every surface form across all corpus files.
    frequency = defaultdict(int)
    for corpus_file in args.files:
        print >> sys.stderr, "Caching words in %s" % corpus_file
        with open2(corpus_file) as reader:
            for sent in reader:
                for tok in sent:
                    frequency[tok._form] += 1

    # word -> full TAB-separated row (word itself included) of the embedding file.
    vectors = {}
    with open(args.efile) as embedding_file:
        for line in embedding_file:
            columns = line.strip().split('\t')
            vectors[columns[0]] = columns

    # Most frequent words first; ties keep their relative order (stable sort).
    for form, _ in sorted(frequency.iteritems(), key=lambda item: item[1], reverse=True):
        if form not in vectors:
            continue
        print >> sys.stdout, "\t".join(vectors[form])
__author__ = 'husnusensoy'

if __name__ == "__main__":
    import argparse

    # Emit every sentence of corpus A that does not occur in corpus B (A - B).
    parser = argparse.ArgumentParser(description='Extracts A-B')
    parser.add_argument('fileA', help='CoNLL file A')
    parser.add_argument('fileB', help='CoNLL file B')
    parser.add_argument('--count', action='store_true', help='Just show number of overlapping sentence')
    args = parser.parse_args()

    seen = set()
    tally = 0
    with open2(args.fileA) as A, open2(args.fileB) as B:
        # Index every sentence of B by its token tuple.
        for sent in B:
            seen.add(tuple(sent.sentence()))
        for sent in A:
            if tuple(sent.sentence()) not in seen:
                if args.count:
                    # NOTE(review): this counts sentences of A *absent* from B,
                    # although the message below calls them "overlapping" — confirm.
                    tally += 1
                else:
                    for tok in sent:
                        print >> sys.stdout, str(tok)
                    print >> sys.stdout
    if args.count:
        print >> sys.stderr, "Total number of overlapping sentences are %d" % tally
# NOTE(review): this chunk starts mid-script — `parser` and the positional
# 'vectorfile'/'file' arguments referenced below are defined above this view.
parser.add_argument('--target', type=str, default='FEATS', choices=['FEATS', 'LEMMA'],
                    help="CoNLL file field to be replaced/extended")
parser.add_argument('--replace', action='store_true', default=False,
                    help="Replace/Extend the relevant field.")
args = parser.parse_args()

# word -> list of vector components, from a TAB-separated file whose first
# column is the surface form and remaining columns are the values.
vlookup = {}
with open(args.vectorfile) as vf:
    for token in vf:
        fields = token.strip().split('\t')
        vlookup[fields[0]] = fields[1:]

# Stream the CoNLL corpus, annotating each known word in place, then print
# every word (one per line) with a blank line after each sentence.
with open2(args.file) as cf:
    for sentence in cf:
        for word in sentence:
            if word._form in vlookup:
                if args.target == 'FEATS':
                    # NOTE(review): with --replace set, this branch still
                    # *appends* to FEATS (yielding a leading "|" when _feats is
                    # empty) instead of overwriting — the condition looks
                    # inverted relative to the --replace help text; confirm.
                    if word._feats or args.replace:
                        word._feats = word._feats + "|" + "|".join(
                            ("F%d=%s" % (i, v) for i, v in enumerate(vlookup[word._form])))
                    else:
                        word._feats = "|".join(("F%d=%s" % (i, v)
                                                for i, v in enumerate(vlookup[word._form])))
                else:
                    # LEMMA target: the vector file must carry exactly one
                    # value column, used verbatim as the lemma.
                    assert len(vlookup[word._form]) == 1
                    word._lemma = vlookup[word._form][0]
            print >> sys.stdout, str(word)
        print >> sys.stdout