# A filename (or directory) argument is mandatory; abort early without one.
if not options.filename:
    logger.error("ERROR: No filename or directory given")
    raise SystemExit(1)
logger.debug("FILENAME = %s", options.filename)

###########################################################
logger.info(options.filename)
trunc = None

## Initialize the corpus class but don't create a dictionary
if options.corpustype is None:
    logger.error("No corpus type specified, exiting...")
    raise SystemExit(1)

# Every supported options.corpustype value maps to the corpus class that
# handles it; all three classes are constructed with the same keyword
# arguments, so one lookup replaces three copy-pasted branches.
corpus_classes = {
    "ted": TedCorpus,
    "fisher": FisherCorpus,
    "gigaword": GigawordCorpus,
}
corpus_class = corpus_classes.get(options.corpustype)
if corpus_class is None:
    logger.error("Unknown corpus type: %s, exiting...", options.corpustype)
    raise SystemExit(1)

# The corpus is constructed with input=None and the real input is attached
# afterwards. NOTE(review): per the comment above this is presumably what
# keeps the constructor from building a dictionary -- confirm against the
# corpus classes.
corpus_sent = corpus_class(
    input=None,
    prefix=options.prefix,
    sentseg=True,
    trunc=trunc,
)
corpus_sent.input = options.filename
if options.cwlist: cwsuffix = "." + os.path.basename(options.cwlist).split()[0] logger.debug("cwsuffix: %s" % cwsuffix) ## Initialize dict with pre-existing cue phrases with open(options.cwlist, "r") as f: for line in f: cw = line.split("\t")[0].lower() cuedict[cw] = 0 if options.cuewords: pstem = "/tmp/cwtmp" outf = open("pstem", "w") ## Initialize the corpus class but don't create a dictionary if options.corpustype == "ted": corpus_sent = TedCorpus(input=None, prefix=options.prefix, sentseg=True, trunc=trunc) corpus_sent.input=options.filename elif options.corpustype == "fisher": corpus_sent = FisherCorpus(input=None, prefix=options.prefix, sentseg=True, trunc=trunc) corpus_sent.input=options.filename elif options.corpustype == "gigaword": corpus_sent = GigawordCorpus(input=None, prefix=options.prefix, sentseg=True, trunc=trunc) corpus_sent.input=options.filename elif options.corpustype == None: logger.error("No corpus type specified, exiting...") raise SystemExit(1) else: logger.error("Unknown corpus type: %s, exiting..." % options.corpustype) raise SystemExit(1) for convid, (cue, (id, sid)) in corpus_sent.get_texts(trunc=trunc, lemma=False, sw=[], convfilter=pd.DataFrame(), convid=True):
###########################################################
logger.info(options.filename)
logger.info("Generate corpus and dictionary from text")
# NOTE(review): this message is emitted for every corpus type, not only
# "ted"; the text is left untouched because it is runtime output.
logger.info("ted corpus")
cuedict = {}

# Optional truncation limit: converted to int when given, otherwise None.
trunc = int(options.trunc) if options.trunc else None

## Initialize the corpus class but don't create a dictionary
if options.corpustype is None:
    logger.error("No corpus type specified, exiting...")
    raise SystemExit(1)

# Every supported options.corpustype value maps to the corpus class that
# handles it; all three classes take the same keyword arguments, so one
# lookup replaces three copy-pasted branches.
corpus_classes = {
    "ted": TedCorpus,
    "fisher": FisherCorpus,
    "gigaword": GigawordCorpus,
}
corpus_class = corpus_classes.get(options.corpustype)
if corpus_class is None:
    logger.error("Unknown corpus type: %s, exiting...", options.corpustype)
    raise SystemExit(1)

# Only the "ted" branch logged its corpus type originally; keep it that way.
if options.corpustype == "ted":
    logger.debug("CORPUSTYPE = ted")

# Two views over the same input, built in the original order: one with
# sentseg=True and one with sentseg=False.
corpus_sent = corpus_class(
    input=options.filename,
    prefix=options.prefix,
    sentseg=True,
    trunc=trunc,
)
corpus = corpus_class(
    input=options.filename,
    prefix=options.prefix,
    sentseg=False,
    trunc=trunc,
)
#corpus = TedCorpus(input=options.filename, prefix=options.prefix, sentseg=False)