def main():
    """ main """
    lang_codes = ['en']
    langid_min_prob = 0.7
    with open('replacements.json') as replacements_file:
        replacements = json.load(replacements_file)

    parser = argparse.ArgumentParser()
    parser.add_argument('tweet_infiles',
                        help='input file paths, comma-separated')
    parser.add_argument('dest_files', help='output file paths, comma-separated')
    parser.add_argument('-l', '--lang_codes')
    parser.add_argument('-p',
                        '--langid_min_prob',
                        type=float,
                        help='output only tweets identified with at least \
                              this langid probability')
    parser.add_argument('-n',
                        '--num_jobs',
                        type=int,
                        default=0,
                        help='number of worker processes to use. Default: \
                              number of cores')
    parser.add_argument('-s', '--queue_size', type=int, default=2000)
    args = parser.parse_args()

    tweet_files = args.tweet_infiles.split(',')
    dest_files = args.dest_files.split(',')

    if args.lang_codes:
        lang_codes = args.lang_codes.split(',')

    if len(tweet_files) != len(dest_files):
        print('tweet_files and dest_files are different sizes')
        sys.exit(1)

    if len(dest_files) != len(lang_codes):
        print('different number of files and language codes')
        sys.exit(1)

    if args.langid_min_prob is not None:
        langid_min_prob = args.langid_min_prob

    for source, dest, lang in zip(tweet_files, dest_files, lang_codes):
        func = partial(filter_classify_lang_line, lang, langid_min_prob,
                       replacements)
        multiprocess = MultiprocessFiles(source,
                                         dest,
                                         func,
                                         num_procs=args.num_jobs,
                                         queue_size=args.queue_size)
        multiprocess.run()
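The example above shows the pattern repeated throughout this listing: functools.partial fixes the leading arguments of a per-line worker, and MultiprocessFiles fans the lines of the source file out to worker processes and writes the results to the destination file. The helper's internals are not part of this listing, so the following is only a minimal single-process sketch, under the assumption that the worker receives one raw line and returns either a transformed line or None to drop it.

import gzip
from functools import partial


def process_file_sequentially(source, dest, func):
    """Hypothetical single-process stand-in for MultiprocessFiles.run():
    apply func to every line of a gzipped source file and write the
    non-None results to a gzipped destination file (assumed contract,
    not the project's actual implementation)."""
    with gzip.open(source, 'rt') as fin, gzip.open(dest, 'wt') as fout:
        for line in fin:
            result = func(line)
            if result is not None:
                fout.write(result if result.endswith('\n') else result + '\n')


# Usage mirroring the loop above; filter_classify_lang_line is assumed to
# take the bound arguments first and the raw line last:
# func = partial(filter_classify_lang_line, 'en', 0.7, replacements)
# process_file_sequentially('tweets.json.gz', 'tweets.en.json.gz', func)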
Example 2
def main():
    """ main """
    parser = argparse.ArgumentParser()
    parser.add_argument('tweet_infiles', help='input files, comma-separated')
    parser.add_argument('dest_files', help='output files, comma-separated')

    parser.add_argument('-s', '--simple', dest='simple', action='store_true',
                        help='selects simple tokenizer instead of twokenizer')

    parser.add_argument('-t', '--twokenize', dest='twokenize',
                        action='store_true',
                        help='twokenizer that keeps contractions intact')

    parser.add_argument('-a', '--twokenize2', dest='twokenize2',
                        action='store_true',
                        help='twokenizer that splits words with apostrophes')

    parser.add_argument('-n', '--num_jobs', type=int, default=0,
                        help='number of worker processes to use. Default: \
                              number of cores')
    parser.add_argument('-q', '--queue_size', type=int, default=2000)

    args = parser.parse_args()

    tweet_files = args.tweet_infiles.split(',')
    dest_files = args.dest_files.split(',')

    if len(tweet_files) != len(dest_files):
        print('tweet_files and dest_files are different sizes')
        sys.exit(1)

    tokenize_function = twokenize.tokenize2
    if args.simple:
        tokenize_function = word_tokenize
        print("Simple Tokenize")
    if args.twokenize:
        tokenize_function = twokenize.tokenize
        print("Tokenize")
    if args.twokenize2:
        tokenize_function = twokenize.tokenize2
        print("Tokenize 2")

    func = partial(tokenize_tweet, tokenize_function)

    for source, dest in zip(tweet_files, dest_files):
        multiprocess = MultiprocessFiles(source, dest, func, 
                                         num_procs=args.num_jobs,
                                         queue_size=args.queue_size)
        multiprocess.run()
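partial(tokenize_tweet, tokenize_function) bakes the chosen tokenizer into a callable that needs only the per-line data, which is all MultiprocessFiles then has to supply. A small self-contained illustration of that binding follows; the names are placeholders, not the project's functions.

from functools import partial


def apply_tokenizer(tokenize, text):
    # Toy stand-in for tokenize_tweet: the tokenizer comes first, the
    # per-call data second, matching the argument order bound above.
    return tokenize(text)


whitespace_tokenize = str.split               # placeholder tokenizer
worker = partial(apply_tokenizer, whitespace_tokenize)
print(worker('just an example tweet'))        # ['just', 'an', 'example', 'tweet']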
Example 3
def main():
    """ main """
    # default parameters
    min_tokens = 5
    max_num_urls = 2
    max_num_users = 3
    with open('replacements.json') as replacements_file:
        replacements = json.load(replacements_file)

    parser = argparse.ArgumentParser()
    parser.add_argument('input_files', help='input file paths, comma-separated')
    parser.add_argument('output_files',
                        help='output file paths, comma-separated')
    parser.add_argument('-t', '--min_tokens', type=int)
    parser.add_argument('-r', '--max_urls', type=int)
    parser.add_argument('-u', '--max_users', type=int)
    parser.add_argument('-n',
                        '--num_jobs',
                        type=int,
                        default=0,
                        help='number of worker processes to use. Default: \
                              number of cores')
    parser.add_argument('-s', '--queue_size', type=int, default=2000)
    args = parser.parse_args()

    if args.min_tokens is not None:
        min_tokens = args.min_tokens

    if args.max_urls is not None:
        max_num_urls = args.max_urls

    if args.max_users is not None:
        max_num_users = args.max_users

    infiles = args.input_files.split(',')
    outfiles = args.output_files.split(',')

    if len(infiles) != len(outfiles):
        print('input_files and output_files are different sizes')
        sys.exit(1)

    func = partial(preprocess_tweet, min_tokens, max_num_urls, max_num_users,
                   replacements)
    for infile, outfile in zip(infiles, outfiles):
        multiprocess = MultiprocessFiles(infile,
                                         outfile,
                                         func,
                                         num_procs=args.num_jobs,
                                         queue_size=args.queue_size)
        multiprocess.run()
Example 6
def main():
    """ main """
    parser = argparse.ArgumentParser()

    parser.add_argument('-n', '--num_jobs', type=int, default=0,
                        help='number of worker processes to use. Default: \
                              number of cores')
    parser.add_argument('-s', '--queue_size', type=int, default=2000)

    parser.add_argument('-b', '--break_hashtags', action="store_true", 
                        dest="break_hashtags", default=False)

    parser.add_argument('-l', '--lowercase', action="store_true", 
                        dest="lowercase", default=False)
    
    parser.add_argument('input_tweet_file')
    parser.add_argument('output_file')
    args = parser.parse_args()

    infile = args.input_tweet_file
    outfile = args.output_file

    with open('replacements.json') as replacements_file:
        replacements = json.load(replacements_file)

    if (not args.lowercase) and (not args.break_hashtags) and \
            (not replacements['number']):
        print('Nothing to do')
        sys.exit(0)

    pp2 = partial(preprocess2, args.lowercase, args.break_hashtags, 
                  replacements)

    multiprocess = MultiprocessFiles(infile, outfile, pp2, 
                                     queue_size=args.queue_size,
                                     num_procs=args.num_jobs)
    multiprocess.run()
Example 7
def main():
    """ main """
    with open('replacements.json') as replacements_file:
        replacements = json.load(replacements_file)
    # Stats Vars
    n_pos = 0
    n_neg = 0
    n = 0
    POS = True
    NEG = False

    parser = argparse.ArgumentParser()

    # Basic
    parser.add_argument('tweets_file')
    parser.add_argument('-p', '--news_feed_path', default=None)
    parser.add_argument('lang_codes', help='lang codes, comma-separated')
    parser.add_argument('-j',
                        '--num_jobs',
                        default=0,
                        type=int,
                        help='0 uses all cores available')
    parser.add_argument('-q', '--queue_size', type=int, default=2000)
    parser.add_argument('output_dir')

    # Exclude Tweets
    parser.add_argument('-e',
                        '--exclude_ids',
                        action='store_true',
                        default=False)
    parser.add_argument('-f', '--ids_file')

    # Preprocessing Part I
    parser.add_argument('-t', '--min_tokens', type=int, default=5)
    parser.add_argument('-r', '--max_urls', type=int, default=2)
    parser.add_argument('-u', '--max_users', type=int, default=3)

    # Language Identification
    parser.add_argument('-prob', '--langid_min_prob', type=float, default=0.8)

    # Tokenization
    parser.add_argument('-s',
                        '--simple',
                        dest='simple',
                        action='store_true',
                        help='selects simple tokenizer')
    parser.add_argument('-to',
                        '--twokenize',
                        dest='twokenize',
                        action='store_true',
                        help='Keep contractions')
    parser.add_argument('-tw',
                        '--twokenize2',
                        dest='twokenize2',
                        action='store_true',
                        help='Split words with apostrophes')

    # Preprocessing Part II
    parser.add_argument('-l',
                        '--lowercase',
                        dest='lowercase',
                        action='store_true',
                        default=False)
    parser.add_argument('-b',
                        '--break_hash',
                        dest='break_hash',
                        action='store_true',
                        default=False)

    # Sentiment Dataset Generation
    parser.add_argument('-m',
                        '--prob_smiley',
                        type=float,
                        default=0.4,
                        help='probability to keep smiley')

    args = parser.parse_args()

    lang_codes = unicode(args.lang_codes).split(',')
    prob_smiley = args.prob_smiley
    tweets_file = args.tweets_file
    tweets_path = args.output_dir
    newsfeed_path = args.news_feed_path
    min_tokens = args.min_tokens
    max_num_urls = args.max_urls
    max_num_users = args.max_users
    langid_min_prob = args.langid_min_prob
    num_jobs = args.num_jobs
    qsize = args.queue_size
    tokenize_function = twokenize.tokenize2
    if args.simple:
        tokenize_function = word_tokenize
    if args.twokenize:
        tokenize_function = twokenize.tokenize
    if args.twokenize2:
        tokenize_function = twokenize.tokenize2
    if args.exclude_ids and not args.ids_file:
        print('no ids file provided (use -f/--ids_file)')
        sys.exit(1)

    filename = os.path.basename(tweets_file)

    # Read newsfeed pickled tweets
    if newsfeed_path is not None:
        convert_tweets(newsfeed_path, tweets_path, filename)
        # point tweets_file at the converted copy in the output directory
        tweets_file = os.path.join(tweets_path, filename)

    for lang_code in lang_codes:
        print('Using %s as language code' % lang_code)

        # new dir for each language (anchored to the original output dir so
        # per-language dirs do not nest when several languages are given)
        tmpdir = os.path.join(args.output_dir,
                              'generated_tweets_' + str(lang_code))
        if not os.path.isdir(tmpdir):
            os.makedirs(tmpdir)
        tweets_path = tmpdir

        # Filter Based on Language
        func = partial(filter_line, lang_code)
        outfile = 'tweets.' + lang_code + '.json.gz'
        outfile = os.path.join(tweets_path, outfile)
        multiprocess_filter_lang = MultiprocessFiles(tweets_file,
                                                     outfile,
                                                     func,
                                                     num_procs=num_jobs,
                                                     queue_size=qsize)
        multiprocess_filter_lang.run()

        # Exclude ids
        if args.exclude_ids:
            with open(args.ids_file, 'r') as ids_fh:
                idlist = set(int(x.strip()) for x in ids_fh)
            func = partial(exclude_ids, idlist)

            infile = outfile
            outfile = 'tweets.' + lang_code + '.exc.json.gz'
            outfile = os.path.join(tweets_path, outfile)
            exclude = MultiprocessFiles(infile,
                                        outfile,
                                        func,
                                        num_procs=num_jobs,
                                        queue_size=qsize)
            exclude.run()

            os.remove(infile)

        # Preprocess Text
        input_file = outfile
        output_file = 'tweets.' + lang_code + '.pp.json.gz'
        output_file = os.path.join(tweets_path, output_file)
        func = partial(preprocess_tweet, min_tokens, max_num_urls,
                       max_num_users, replacements)
        preprocess = MultiprocessFiles(input_file,
                                       output_file,
                                       func,
                                       num_procs=num_jobs,
                                       queue_size=qsize)
        preprocess.run()

        os.remove(input_file)

        # Lang Identification
        infile = output_file
        dest_file = 'tweets.' + lang_code + '.pp.lid.json.gz'
        dest_file = os.path.join(tweets_path, dest_file)

        func = partial(filter_classify_lang_line, lang_code, langid_min_prob,
                       replacements)

        classify = MultiprocessFiles(infile,
                                     dest_file,
                                     func,
                                     num_procs=num_jobs,
                                     queue_size=qsize)
        classify.run()

        os.remove(infile)

        # Preprocess 2
        if args.lowercase or args.break_hash or replacements['number']:
            infile = dest_file
            outfile = 'tweets.lowercase.' + lang_code + '.json.gz'
            outfile = os.path.join(tweets_path, outfile)
            pp2_func = partial(preprocess2, args.lowercase, args.break_hash,
                               replacements)
            pp2 = MultiprocessFiles(infile,
                                    outfile,
                                    pp2_func,
                                    num_procs=num_jobs,
                                    queue_size=qsize)
            pp2.run()

            os.remove(infile)

        # Tokenization
        if args.lowercase or args.break_hash or replacements['number']:
            source = outfile
        else:
            source = dest_file
        dest = 'tweets.' + lang_code + '.pp.lid.tok.json.gz'
        dest = os.path.join(tweets_path, dest)

        func = partial(tokenize_tweet, tokenize_function)
        tokenizer = MultiprocessFiles(source,
                                      dest,
                                      func,
                                      num_procs=num_jobs,
                                      queue_size=qsize)
        tokenizer.run()

        os.remove(source)

        # Filter unique
        infile = dest
        outfile = 'tweets.' + lang_code + '.final.json.gz'
        outfile = os.path.join(tweets_path, outfile)

        tweet_ids = set()
        make_unique(infile, outfile, tweet_ids)

        os.remove(infile)

        # Filter emoticons
        pos_path = os.path.join(tweets_path, 'pos.txt')
        f_pos = codecs.open(pos_path, 'w', encoding='utf-8')

        neg_path = os.path.join(tweets_path, 'neg.txt')
        f_neg = codecs.open(neg_path, 'w', encoding='utf-8')

        other_path = os.path.join(tweets_path, 'other.txt')
        f_other = codecs.open(other_path, 'w', encoding='utf-8')

        # Read and Process
        with gzip.open(outfile, 'r') as f:
            for line in f:
                res = process_line(prob_smiley, line)
                n += 1

                if res is None:
                    continue
                if res[0] is None:
                    f_other.write(res[1] + u'\n')
                elif res[0] == POS:
                    n_pos += 1
                    f_pos.write(res[1] + u'\n')
                elif res[0] == NEG:
                    n_neg += 1
                    f_neg.write(res[1] + u'\n')

        # explicitly close files
        f_pos.close()
        f_neg.close()
        f_other.close()
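The read-and-process loop above implies a contract for process_line: it returns None for a line to be skipped, otherwise a (label, text) pair where the label is True for a positive tweet, False for a negative one, and None for everything else. process_line itself is not part of this listing; the sketch below reconstructs only that contract, with an illustrative emoticon list and the prob_smiley coin flip deciding whether the emoticon stays in the text (both assumptions, not the project's actual logic).

import random

POSITIVE_EMOTICONS = {':)', ':-)', ':D'}      # illustrative lists, not the
NEGATIVE_EMOTICONS = {':(', ':-(', ":'("}     # project's actual lexicons


def process_line_sketch(prob_smiley, line):
    """Hypothetical stand-in matching the contract used in the loop above:
    None to skip a line, otherwise (label, text) with label True/False/None."""
    text = line.strip()
    if not text:
        return None
    tokens = text.split()
    has_pos = any(t in POSITIVE_EMOTICONS for t in tokens)
    has_neg = any(t in NEGATIVE_EMOTICONS for t in tokens)
    if has_pos == has_neg:            # no signal, or conflicting signals
        return (None, text)
    if random.random() > prob_smiley:
        # keep the emoticon with probability prob_smiley, otherwise strip it
        tokens = [t for t in tokens
                  if t not in POSITIVE_EMOTICONS and t not in NEGATIVE_EMOTICONS]
    return (has_pos, ' '.join(tokens))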