예제 #1
0
def extract(item):
    """Extract words from one ``(date, text)`` pair and tag them with the date.

    Args:
        item: A two-element iterable unpacked as ``(date, text)``.

    Returns:
        A ``(words, date)`` tuple, where ``words`` is whatever
        ``Extractor.extract_words`` returns, with its ``'date'`` entry
        set to ``date``.

    Note:
        ``max_len`` and ``thresh`` are not parameters; they are read from
        the enclosing scope.
    """
    stamp, body = item
    # presumably a pandas DataFrame, so the assignment below adds/overwrites
    # a 'date' column -- confirm against Extractor.extract_words
    found = Extractor(text=body, max_len=max_len).extract_words(thresh=thresh)
    found['date'] = stamp
    return found, stamp
예제 #2
0
                    required=False,
                    default=False,
                    type=bool,
                    dest='preprocess')

if __name__ == '__main__':
    # Script entry point: parse the CLI arguments, build an Extractor either
    # from pre-read lines or directly from the input path, extract words,
    # optionally save them as a tab-separated file, and report elapsed time.
    tic = time()
    args = parser.parse_args()
    rfpath = join(RFDIR, args.fname)
    print(args.preprocess, args.count)
    if not args.preprocess:
        # Try the platform default encoding first and retry as UTF-8 only
        # when decoding fails. Other errors (missing file, permissions,
        # Ctrl-C) propagate instead of being masked by a blanket retry.
        # The context managers ensure the file handle is always closed.
        try:
            with open(rfpath, "r") as src:
                text = src.readlines()
        except UnicodeDecodeError:
            with open(rfpath, "r", encoding="utf-8") as src:
                text = src.readlines()
        text = [line.strip() for line in text]
        extracter = Extractor(text=text, max_len=args.ngram)
    else:
        # Let the Extractor read the file itself from the given path.
        extracter = Extractor(rfpath=rfpath, max_len=args.ngram)
    words = extracter.extract_words(score_thresh=args.thresh,
                                    cnt_thresh=args.count)
    if args.save:
        # Use the explicit output name when given, else reuse the input name.
        opath = join(WFDIR, args.oname or args.fname)
        words.to_csv(opath, encoding="utf_8_sig", index=False, sep='\t')
    print(words)
    toc = time()
    print("Total time: %.2fs" % (toc - tic))
예제 #3
0
                    default=4.0,
                    type=float,
                    dest='thresh')
# Maximum n-gram length passed to the Extractor as ``max_len``.
parser.add_argument("-n",
                    "--ngram",
                    required=False,
                    default=4,
                    type=int,
                    dest='ngram')


def _parse_bool_flag(value):
    """Convert a command-line string to a bool.

    ``type=bool`` treats every non-empty string as True, so ``--save False``
    would enable saving. This converter maps the usual "false" spellings
    (case-insensitive) and the empty string to False. Any other value stays
    True, so invocations that previously enabled the flag still do.

    Args:
        value: The raw string supplied on the command line.

    Returns:
        False for an empty or recognised "false" spelling, True otherwise.
    """
    return value.lower() not in {"", "0", "false", "f", "no", "n", "off"}


# Whether to write the extracted words to disk; takes an explicit value,
# e.g. ``--save True`` / ``--save False``.
parser.add_argument("--save",
                    required=False,
                    default=False,
                    type=_parse_bool_flag,
                    dest='save')

if __name__ == '__main__':
    # Script entry point: parse the CLI arguments, run word extraction on
    # the input file, optionally save the result, and print elapsed time.
    started = time()
    args = parser.parse_args()
    source_path = join(RFDIR, args.fname)
    word_extractor = Extractor(source_path, max_len=args.ngram)
    words = word_extractor.extract_words(thresh=args.thresh)
    if args.save:
        # Use the explicit output name when given, else reuse the input name.
        target_name = args.oname if args.oname else args.fname
        words.to_csv(join(WFDIR, target_name),
                     encoding="utf_8_sig",
                     index=False,
                     sep='\t')
    print(words)
    finished = time()
    print("Total time: %.2fs" % (finished - started))