def main(): """ main """ lang_codes = ['en'] langid_min_prob = 0.7 replacements = json.load(open('replacements.json')) parser = argparse.ArgumentParser() parser.add_argument('tweet_infiles', help='input file paths comma seperated') parser.add_argument('dest_files', help='output file paths comma seperated') parser.add_argument('-l', '--lang_codes') parser.add_argument('-p', '---langid_min_prob', type=float, help='outputs only tweets that have langid_min_prob \ or higher probability') parser.add_argument('-n', '--num_jobs', type=int, default=0, help='number of worker processes to use. Default: \ number of cores') parser.add_argument('-s', '--queue_size', type=int, default=2000) args = parser.parse_args() tweet_files = args.tweet_infiles.split(',') dest_files = args.dest_files.split(',') if args.lang_codes: lang_codes = args.lang_codes.split(',') if not len(tweet_files) == len(dest_files): print('tweet_files and dest_files are different sizes') sys.exit(0) if not len(dest_files) == len(lang_codes): print('different number of files and language codes') sys.exit(0) if args.langid_min_prob: langid_min_prob = args.langid_min_prob for source, dest, lang in zip(tweet_files, dest_files, lang_codes): func = partial(filter_classify_lang_line, lang, langid_min_prob, replacements) multiprocess = MultiprocessFiles(source, dest, func, num_procs=args.num_jobs, queue_size=args.queue_size) multiprocess.run()
def main(): """ main """ parser = argparse.ArgumentParser() parser.add_argument('tweet_infiles', help='input files comma seperated') parser.add_argument('dest_files', help='output files comma seperated') parser.add_argument('-s', '--simple', dest='simple', action='store_true', help='selects simple tokenizer instead of twokenizer') parser.add_argument('-t', '--twokenize', dest='twokenize', action='store_true', help='twokenizer that does not break apostroph words') parser.add_argument('-a', '--twokenize2', dest='twokenize2', action='store_true', help='twokenizer that breaks apostroph words') parser.add_argument('-n', '--num_jobs', type=int, default=0, help='number of worker processes to use. Default: \ number of cores') parser.add_argument('-q', '--queue_size', type=int, default=2000) args = parser.parse_args() tweet_files = args.tweet_infiles.split(',') dest_files = args.dest_files.split(',') if not len(tweet_files) == len(dest_files): print('tweet_files and dest_files are different sizes') sys.exit(0) tokenize_function = twokenize.tokenize2 if args.simple: tokenize_function = word_tokenize print("Simple Tokenize") if args.twokenize: tokenize_function = twokenize.tokenize print("Tokenize") if args.twokenize2: tokenize_function = twokenize.tokenize2 print("Tokenize 2") func = partial(tokenize_tweet, tokenize_function) for source, dest in zip(tweet_files, dest_files): multiprocess = MultiprocessFiles(source, dest, func, num_procs=args.num_jobs, queue_size=args.queue_size) multiprocess.run()
def main(): """ main """ # default parameters min_tokens = 5 max_num_urls = 2 max_num_users = 3 replacements = json.load(open('replacements.json')) parser = argparse.ArgumentParser() parser.add_argument('input_files', help='input file paths comma seperated') parser.add_argument('output_files', help='output file paths comma seperated') parser.add_argument('-t', '--min_tokens', type=int) parser.add_argument('-r', '--max_urls', type=int) parser.add_argument('-u', '--max_users', type=int) parser.add_argument('-n', '--num_jobs', type=int, default=0, help='number of worker processes to use. Default: \ number of cores') parser.add_argument('-s', '--queue_size', type=int, default=2000) args = parser.parse_args() if args.min_tokens: min_tokens = args.min_tokens if args.max_urls: max_num_urls = args.max_urls if args.max_users: max_num_users = args.max_users infiles = args.input_files.split(',') outfiles = args.output_files.split(',') if not len(infiles) == len(outfiles): print('Input files and output_files do not match in size') sys.exit(0) func = partial(preprocess_tweet, min_tokens, max_num_urls, max_num_users, replacements) for infile, outfile in zip(infiles, outfiles): multiprocess = MultiprocessFiles(infile, outfile, func, num_procs=args.num_jobs, queue_size=args.queue_size) multiprocess.run()
def main(): """ main """ parser = argparse.ArgumentParser() parser.add_argument('-n', '--num_jobs', type=int, default=0, help='number of worker processes to use. Default: \ number of cores') parser.add_argument('-s', '--queue_size', type=int, default=2000) parser.add_argument('-b', '--break_hashtags', action="store_true", dest="break_hashtags", default=False) parser.add_argument('-l', '--lowercase', action="store_true", dest="lowercase", default=False) parser.add_argument('input_tweet_file') parser.add_argument('output_file') args = parser.parse_args() infile = args.input_tweet_file outfile = args.output_file replacements = json.load(open('replacements.json')) if (not args.lowercase) and (not args.break_hashtags) and \ (not replacements['number']): print('Nothing to do') sys.exit(0) pp2 = partial(preprocess2, args.lowercase, args.break_hashtags, replacements) multiprocess = MultiprocessFiles(infile, outfile, pp2, queue_size=args.queue_size, num_procs=args.num_jobs) multiprocess.run()
def main(): """ main """ replacements = json.load(open('replacements.json')) # Stats Vars n_pos = 0 n_neg = 0 n = 0 POS = True NEG = False parser = argparse.ArgumentParser() # Basic parser.add_argument('tweets_file') parser.add_argument('-p', '--news_feed_path', default=None) parser.add_argument('lang_codes', help='lang codes comma-seperated') parser.add_argument('-j', '--num_jobs', default=0, type=int, help='0 uses all cores available') parser.add_argument('-q', '--queue_size', type=int, default=2000) parser.add_argument('output_dir') # Exclude Tweets parser.add_argument('-e', '--exclude_ids', action='store_true', default=False) parser.add_argument('-f', '--ids_file') # Preprocessing Part I parser.add_argument('-t', '--min_tokens', type=int, default=5) parser.add_argument('-r', '--max_urls', type=int, default=2) parser.add_argument('-u', '--max_users', type=int, default=3) # Language Identification parser.add_argument('-prob', '--langid_min_prob', type=float, default=0.8) # Tokenization parser.add_argument('-s', '--simple', dest='simple', action='store_true', help='selects simple tokenizer') parser.add_argument('-to', '--twokenize', dest='twokenize', action='store_true', help='Keep contractions') parser.add_argument('-tw', '--twokenize2', dest='twokenize2', action='store_true', help='Break apostroph words') # Preprocessing Part II parser.add_argument('-l', '--lowercase', dest='lowercase', action='store_true', default=False) parser.add_argument('-b', '--break_hash', dest='break_hash', action='store_true', default=False) # Sentiment Dataset Generation parser.add_argument('-m', '--prob_smiley', type=float, default=0.4, help='probability to keep smiley') args = parser.parse_args() lang_codes = unicode(args.lang_codes).split(',') prob_smiley = args.prob_smiley tweets_file = args.tweets_file tweets_path = args.output_dir newsfeed_path = args.news_feed_path min_tokens = args.min_tokens max_num_urls = args.max_urls max_num_users = args.max_users langid_min_prob = args.langid_min_prob num_jobs = args.num_jobs qsize = args.queue_size tokenize_function = twokenize.tokenize2 if args.simple: tokenize_function = word_tokenize if args.twokenize: tokenize_function = twokenize.tokenize if args.twokenize2: tokenize_function = twokenize.tokenize2 if args.exclude_ids and not args.ids_file: print('no ids file provided') sys.exit(0) filename = os.path.basename(tweets_file) # Read newsfeed pickled tweets if newsfeed_path is not None: convert_tweets(newsfeed_path, tweets_path, filename) # added if tweets file argument is not correct, try if it works tweets_file = os.path.join(tweets_path, filename) for lang_code in lang_codes: print('Using %s as language code' % lang_code) # new dir for each language tmpdir = os.path.join(tweets_path, 'generated_tweets_' + str(lang_code)) if not os.path.isdir(tmpdir): os.makedirs(tmpdir) tweets_path = tmpdir # Filter Based on Language func = partial(filter_line, lang_code) outfile = 'tweets.' + lang_code + '.json.gz' outfile = os.path.join(tweets_path, outfile) multiprocess_filter_lang = MultiprocessFiles(tweets_file, outfile, func, num_procs=num_jobs, queue_size=qsize) multiprocess_filter_lang.run() # Exclude ids if args.exclude_ids: idlist = open(args.ids_file, 'r').readlines() idlist = set([int(x.strip()) for x in idlist]) func = partial(exclude_ids, idlist) infile = outfile outfile = 'tweets.' 
+ lang_code + '.exc.json.gz' outfile = os.path.join(tweets_path, outfile) exclude = MultiprocessFiles(infile, outfile, func, num_procs=num_jobs, queue_size=qsize) exclude.run() os.remove(infile) # Preprocess Text input_file = outfile output_file = 'tweets.' + lang_code + '.pp.json.gz' output_file = os.path.join(tweets_path, output_file) func = partial(preprocess_tweet, min_tokens, max_num_urls, max_num_users, replacements) preprocess = MultiprocessFiles(input_file, output_file, func, num_procs=num_jobs, queue_size=qsize) preprocess.run() os.remove(input_file) # Lang Identification infile = output_file dest_file = 'tweets.' + lang_code + '.pp.lid.json.gz' dest_file = os.path.join(tweets_path, dest_file) func = partial(filter_classify_lang_line, lang_code, langid_min_prob, replacements) classify = MultiprocessFiles(infile, dest_file, func, num_procs=num_jobs, queue_size=qsize) classify.run() os.remove(infile) # Preprocess 2 if args.lowercase or args.break_hash or replacements['number']: infile = dest_file outfile = 'tweets.lowercase.' + lang_code + '.json.gz' outfile = os.path.join(tweets_path, outfile) pp2_func = partial(preprocess2, args.lowercase, args.break_hash, replacements) pp2 = MultiprocessFiles(infile, outfile, pp2_func, num_procs=num_jobs, queue_size=qsize) pp2.run() os.remove(infile) # Tokenization if args.lowercase or args.break_hash or replacements['number']: source = outfile else: source = dest_file dest = 'tweets.' + lang_code + '.pp.lid.tok.json.gz' dest = os.path.join(tweets_path, dest) func = partial(tokenize_tweet, tokenize_function) tokenizer = MultiprocessFiles(source, dest, func, num_procs=num_jobs, queue_size=qsize) tokenizer.run() os.remove(source) # Filter unique infile = dest outfile = 'tweets.' + lang_code + '.final.json.gz' outfile = os.path.join(tweets_path, outfile) tweet_ids = set() make_unique(infile, outfile, tweet_ids) os.remove(infile) # Filter emoticons pos_path = os.path.join(tweets_path, 'pos.txt') f_pos = codecs.open(pos_path, 'w', encoding='utf-8') neg_path = os.path.join(tweets_path, 'neg.txt') f_neg = codecs.open(neg_path, 'w', encoding='utf-8') other_path = os.path.join(tweets_path, 'other.txt') f_other = codecs.open(other_path, 'w', encoding='utf-8') # Read and Process with gzip.open(outfile, 'r') as f: for line in f: res = process_line(prob_smiley, line) n += 1 if res is None: continue if res[0] is None: f_other.write(res[1] + u'\n') if res[0] == POS: n_pos += 1 f_pos.write(res[1] + u'\n') if res[0] == NEG: f_neg.write(res[1] + u'\n') n_neg += 1 # explicitly close files f_pos.close() f_neg.close() f_other.close()
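# process_line is the repo's emoticon labeler; its real logic is not shown in
# this section. A purely illustrative sketch that matches the consumer above
# (None = drop, (None, text) = other, (True, text) = positive,
# (False, text) = negative); the emoticon sets and the stripping rule are
# assumptions:

import json
import random

POS_EMOTICONS = frozenset([':)', ':-)', ':D', '=)'])
NEG_EMOTICONS = frozenset([':(', ':-(', '=('])


def process_line(prob_smiley, line):
    """Label a tweet by its emoticons, keeping the smileys themselves in the
    output text with probability prob_smiley."""
    tokens = json.loads(line)['text'].split()
    has_pos = any(t in POS_EMOTICONS for t in tokens)
    has_neg = any(t in NEG_EMOTICONS for t in tokens)
    if has_pos and has_neg:
        return None  # mixed signals: drop the tweet
    if (has_pos or has_neg) and random.random() >= prob_smiley:
        # Strip the label-bearing smileys so a classifier cannot rely on them.
        tokens = [t for t in tokens
                  if t not in POS_EMOTICONS and t not in NEG_EMOTICONS]
    text = u' '.join(tokens)
    if has_pos:
        return (True, text)   # POS
    if has_neg:
        return (False, text)  # NEG
    return (None, text)       # no emoticon: goes to other.txt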