def extract(item):
    """Run word extraction on a single ``(date, text)`` pair.

    Args:
        item: A two-element iterable unpacked as ``(date, text)``.

    Returns:
        A ``(words, date)`` tuple, where ``words`` is whatever
        ``Extractor.extract_words`` returns, with ``date`` stored under its
        ``'date'`` key.

    Note:
        ``max_len`` and ``thresh`` are not parameters; they are looked up in
        the enclosing scope when the function is called.
    """
    day, content = item
    words = Extractor(text=content, max_len=max_len).extract_words(thresh=thresh)
    # Tag the result with the date it was extracted for.
    words['date'] = day
    return words, day
required=False, default=False, type=bool, dest='preprocess') if __name__ == '__main__': tic = time() args = parser.parse_args() rfpath = join(RFDIR, args.fname) print(args.preprocess, args.count) if not args.preprocess: try: text = open(rfpath, "r").readlines() except: text = open(rfpath, "r", encoding="utf-8").readlines() text = [line.strip() for line in text] extracter = Extractor(text=text, max_len=args.ngram) else: extracter = Extractor(rfpath=rfpath, max_len=args.ngram) words = extracter.extract_words(score_thresh=args.thresh, cnt_thresh=args.count) if args.save: if args.oname: opath = join(WFDIR, args.oname) words.to_csv(opath, encoding="utf_8_sig", index=False, sep='\t') else: opath = join(WFDIR, args.fname) words.to_csv(opath, encoding="utf_8_sig", index=False, sep='\t') print(words) toc = time() print("Total time: %.2fs" % (toc - tic))
default=4.0, type=float, dest='thresh') parser.add_argument("-n", "--ngram", required=False, default=4, type=int, dest='ngram') parser.add_argument("--save", required=False, default=False, type=bool, dest='save') if __name__ == '__main__': tic = time() args = parser.parse_args() rfpath = join(RFDIR, args.fname) extracter = Extractor(rfpath, max_len=args.ngram) words = extracter.extract_words(thresh=args.thresh) if args.save: if args.oname: opath = join(WFDIR, args.oname) words.to_csv(opath, encoding="utf_8_sig", index=False, sep='\t') else: opath = join(WFDIR, args.fname) words.to_csv(opath, encoding="utf_8_sig", index=False, sep='\t') print(words) toc = time() print("Total time: %.2fs" % (toc - tic))