Esempio n. 1
0
	## Abort with exit status 1 when options.filename is empty or unset;
	## otherwise record the value that will be processed.
	if not options.filename:
		logger.error("ERROR: No filename or directory given")
		raise SystemExit(1)
	logger.debug("FILENAME = %s", options.filename)


	###########################################################
	logger.info(options.filename)
	## trunc is forwarded unchanged to the corpus constructor below.
	trunc = None

	## Initialize the corpus class but don't create a dictionary:
	## the reader is constructed with input=None and only afterwards
	## pointed at options.filename.
	if options.corpustype is None:
		logger.error("No corpus type specified, exiting...")
		raise SystemExit(1)

	## Select the reader class that matches the requested corpus type.
	if options.corpustype == "ted":
		corpus_cls = TedCorpus
	elif options.corpustype == "fisher":
		corpus_cls = FisherCorpus
	elif options.corpustype == "gigaword":
		corpus_cls = GigawordCorpus
	else:
		logger.error("Unknown corpus type: %s, exiting...", options.corpustype)
		raise SystemExit(1)

	corpus_sent = corpus_cls(input=None, prefix=options.prefix, sentseg=True, trunc=trunc)
	corpus_sent.input = options.filename

Esempio n. 2
0
	if options.cwlist:
		## NOTE(review): str.split() without an argument splits on whitespace,
		## so the suffix is the first whitespace-delimited token of the file's
		## basename -- confirm that splitting on "." was not intended.
		cwsuffix = "." + os.path.basename(options.cwlist).split()[0]
		logger.debug("cwsuffix: %s", cwsuffix)
		## Initialize dict with pre-existing cue phrases: the first
		## tab-separated field of every line, lower-cased, mapped to 0.
		with open(options.cwlist, "r") as f:
			for line in f:
				## Drop the line terminator first so that a line without
				## any tab does not keep a trailing newline in its key.
				cw = line.rstrip("\r\n").split("\t")[0].lower()
				## Blank lines / empty first fields carry no cue phrase.
				if cw:
					cuedict[cw] = 0

	if options.cuewords:
		pstem = "/tmp/cwtmp"
		## NOTE(review): this opens a file literally named "pstem" in the
		## current working directory, not the path held in the pstem variable
		## above -- confirm whether open(pstem, "w") was intended. The handle
		## is not closed within this span; presumably that happens later.
		outf = open("pstem", "w")

		## Initialize the corpus class but don't create a dictionary:
		## the reader is constructed with input=None and only afterwards
		## pointed at options.filename.
		if options.corpustype is None:
			logger.error("No corpus type specified, exiting...")
			raise SystemExit(1)

		## Select the reader class that matches the requested corpus type.
		if options.corpustype == "ted":
			corpus_cls = TedCorpus
		elif options.corpustype == "fisher":
			corpus_cls = FisherCorpus
		elif options.corpustype == "gigaword":
			corpus_cls = GigawordCorpus
		else:
			logger.error("Unknown corpus type: %s, exiting...", options.corpustype)
			raise SystemExit(1)

		corpus_sent = corpus_cls(input=None, prefix=options.prefix, sentseg=True, trunc=trunc)
		corpus_sent.input = options.filename

		for convid, (cue, (id, sid)) in corpus_sent.get_texts(trunc=trunc, lemma=False, sw=[], convfilter=pd.DataFrame(), convid=True):
Esempio n. 3
0
	###########################################################
	logger.info(options.filename)

	logger.info("Generate corpus and dictionary from text")
	## NOTE(review): this message is logged unconditionally, i.e. for every
	## corpus type and not only for "ted".
	logger.info("ted corpus")

	## Truncation limit: options.trunc converted to int when it is set
	## (truthy), otherwise None.
	trunc = int(options.trunc) if options.trunc else None

	## Starts out empty.
	cuedict = {}

	## Initialize the corpus class but don't create a dictionary
	## Two readers over the same input are built from the selected class:
	## corpus_sent with sentseg=True and corpus with sentseg=False.
	if options.corpustype is None:
		logger.error("No corpus type specified, exiting...")
		raise SystemExit(1)

	## Select the reader class that matches the requested corpus type.
	if options.corpustype == "ted":
		logger.debug("CORPUSTYPE = ted")
		corpus_cls = TedCorpus
	elif options.corpustype == "fisher":
		corpus_cls = FisherCorpus
	elif options.corpustype == "gigaword":
		corpus_cls = GigawordCorpus
	else:
		logger.error("Unknown corpus type: %s, exiting...", options.corpustype)
		raise SystemExit(1)

	corpus_sent = corpus_cls(input=options.filename, prefix=options.prefix, sentseg=True, trunc=trunc)
	corpus = corpus_cls(input=options.filename, prefix=options.prefix, sentseg=False, trunc=trunc)

	#corpus = TedCorpus(input=options.filename, prefix=options.prefix, sentseg=False)